# Import libraries

In [1]:
import pandas as pd
import plotly.graph_objs as go
import random

# Import csv

In [2]:
# Load the full draw history; the file is semicolon-delimited.
csv_path = './EuroMillions.csv'
df = pd.read_csv(csv_path, sep=";")
df

Unnamed: 0,Date,N1,N2,N3,N4,N5,E1,E2,Winner,Gain
0,2004-01-10,2,4,31,17,1,9,3,0,0
1,2004-01-10,19,28,45,15,34,12,10,0,0
2,2004-01-10,20,5,50,38,48,1,4,0,0
3,2004-01-10,8,33,50,48,13,12,1,0,0
4,2004-01-10,48,20,45,21,12,6,5,1,29785269
...,...,...,...,...,...,...,...,...,...,...
6585,2021-12-01,36,43,17,25,38,11,5,0,0
6586,2021-12-01,17,39,41,25,14,4,9,0,0
6587,2021-12-01,10,38,32,4,3,9,5,0,0
6588,2021-12-01,17,5,7,2,26,2,8,0,0


# Get the winning numbers

In [3]:
# Keep only the rows whose Gain is non-zero and renumber them from 0.
has_gain = df['Gain'].ne(0)
dfWin = df.loc[has_gain].reset_index(drop=True)
dfWin

Unnamed: 0,Date,N1,N2,N3,N4,N5,E1,E2,Winner,Gain
0,2004-01-10,48,20,45,21,12,6,5,1,29785269
1,2004-02-13,32,16,29,41,36,9,7,1,15000000
2,2004-04-06,41,42,34,13,9,7,3,1,12488171
3,2004-05-03,39,37,4,7,33,5,1,1,20678070
4,2004-05-14,32,3,1,39,21,2,6,1,33816137
...,...,...,...,...,...,...,...,...,...,...
1313,2021-01-29,1,5,27,36,42,1,6,0,30681161
1314,2021-02-02,18,20,35,38,48,9,12,0,41721143
1315,2021-05-01,12,34,37,38,42,1,3,0,17000000
1316,2021-08-01,18,23,37,41,42,4,6,0,30824040


# Count how often each winning number appeared

In [4]:
# count number of wins
# Pool the five main-number columns and count every appearance in one pass.
# Adding the per-column value_counts() aligns on the index, so a number that is
# missing from any single column would yield NaN instead of its real total.
count = pd.concat([dfWin[col] for col in ['N1', 'N2', 'N3', 'N4', 'N5']]).value_counts().sort_index()

In [5]:
# draw the chart for count: one bar per number, bar height = number of appearances
win_bars = go.Bar(x=count.index, y=count.values, name='Win')
fig = go.Figure()
fig.add_trace(win_bars)
fig.show()

# Verify random

First, we check that the recorded draws behave like random draws.    
To match the size of the dataset, we generate 1318 random draws of 5 distinct numbers between 1 and 50.     
We then compare these simulated draws with the dataset.

In [6]:
def generateDraws(nbrDraw):
    """Simulate `nbrDraw` draws of 5 distinct numbers between 1 and 50.

    Parameters
    ----------
    nbrDraw : int
        Number of draws (rows) to generate.

    Returns
    -------
    pandas.DataFrame
        One row per draw with columns N1..N5, holding the numbers in the order
        they were sampled, indexed 0..nbrDraw-1. An empty frame with the same
        columns is returned when `nbrDraw` is 0.
    """
    columns = ['N1', 'N2', 'N3', 'N4', 'N5']
    # Collect every draw first and build the frame once: DataFrame.append was
    # removed in pandas 2.0, and growing a frame row by row re-copies it on
    # each iteration (quadratic for the 100,000-draw simulation).
    draws = [random.sample(range(1, 51), len(columns)) for _ in range(nbrDraw)]
    return pd.DataFrame(draws, columns=columns)

# Seed the generator so the simulated draws (and every chart built from them)
# are identical on each Restart & Run All.
random.seed(42)
dfLittleDraw = generateDraws(1318)

In [7]:
# count values in dataframes
# Pool the five number columns before counting: summing per-column
# value_counts() aligns on the index and turns any number missing from one
# column into NaN, which silently drops its bar from the chart.
numberCols = ['N1', 'N2', 'N3', 'N4', 'N5']
countLittle = pd.concat([dfLittleDraw[col] for col in numberCols]).value_counts().sort_index()
countDfWin = pd.concat([dfWin[col] for col in numberCols]).value_counts().sort_index()
# trace the chart
fig = go.Figure()
fig.add_trace(go.Bar(x=countLittle.index, y=countLittle.values, name='Random'))
fig.add_trace(go.Bar(x=countDfWin.index, y=countDfWin.values, name='Datasource'))
fig.show()

We can observe that the number frequencies in the data source are similar to those of the computer-generated random draws, so the recorded draws appear consistent with a random process.

# Convergence

In order to analyze the convergence of the draws, we are going to generate 100,000 more random draws of 5 numbers.

In [11]:
# Extend the initial simulated sample with 100,000 additional random draws.
extraDraws = generateDraws(100000)
dfManyDraw = pd.concat([dfLittleDraw, extraDraws])

In [12]:
# count values in dataframe
# Pool the five number columns before counting so that a number missing from
# one column is still counted (per-column sums would give NaN for it).
countMany = pd.concat([dfManyDraw[col] for col in ['N1', 'N2', 'N3', 'N4', 'N5']]).value_counts().sort_index()

# normalize counts
# Scale each series by its own maximum so both samples share a 0..1 axis
# despite their very different sizes.
countLittle = countLittle / countLittle.max()
countMany = countMany / countMany.max()

# trace the chart
fig = go.Figure()
fig.add_trace(go.Bar(x=countLittle.index, y=countLittle.values, name='1,318'))
fig.add_trace(go.Bar(x=countMany.index, y=countMany.values, name='101,318'))
fig.show()

As can be seen in this comparison, the counts of the numbers drawn tend to stabilize as the number of draws grows, and the gap between the most and least drawn numbers narrows.

# Conclusion
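We can notice that the counts of the numbers drawn are currently slightly unbalanced, but their relative frequencies will gradually even out as more draws accumulate.    
This does not mean that rarely drawn numbers will be drawn more often in the future than frequently drawn ones: each draw is independent, so every number keeps the same probability. The imbalance shrinks only because early deviations are diluted by the growing number of draws (law of large numbers).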