In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import seaborn as sns
from matplotlib.ticker import FuncFormatter

In [2]:
# Read the SkillCraft1 CSV into `df` and print its column summary.
csv_path = "SkillCraft1_Dataset.csv"
df = pd.read_csv(csv_path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3395 entries, 0 to 3394
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   GameID                3395 non-null   int64  
 1   LeagueIndex           3395 non-null   int64  
 2   Age                   3395 non-null   object 
 3   HoursPerWeek          3395 non-null   object 
 4   TotalHours            3395 non-null   object 
 5   APM                   3395 non-null   float64
 6   SelectByHotkeys       3395 non-null   float64
 7   AssignToHotkeys       3395 non-null   float64
 8   UniqueHotkeys         3395 non-null   int64  
 9   MinimapAttacks        3395 non-null   float64
 10  MinimapRightClicks    3395 non-null   float64
 11  NumberOfPACs          3395 non-null   float64
 12  GapBetweenPACs        3395 non-null   float64
 13  ActionLatency         3395 non-null   float64
 14  ActionsInPAC          3395 non-null   float64
 15  TotalMapExplored     

###### We observe here that three columns are of object type. Something is wrong with these columns and it must be corrected.

In [3]:
# Display the raw frame as the cell output.
df

Unnamed: 0,GameID,LeagueIndex,Age,HoursPerWeek,TotalHours,APM,SelectByHotkeys,AssignToHotkeys,UniqueHotkeys,MinimapAttacks,MinimapRightClicks,NumberOfPACs,GapBetweenPACs,ActionLatency,ActionsInPAC,TotalMapExplored,WorkersMade,UniqueUnitsMade,ComplexUnitsMade,ComplexAbilitiesUsed
0,52,5,27,10,3000,143.7180,0.003515,0.000220,7,0.000110,0.000392,0.004849,32.6677,40.8673,4.7508,28,0.001397,6,0.000000,0.000000
1,55,5,23,10,5000,129.2322,0.003304,0.000259,4,0.000294,0.000432,0.004307,32.9194,42.3454,4.8434,22,0.001193,5,0.000000,0.000208
2,56,4,30,10,200,69.9612,0.001101,0.000336,4,0.000294,0.000461,0.002926,44.6475,75.3548,4.0430,22,0.000745,6,0.000000,0.000189
3,57,3,19,20,400,107.6016,0.001034,0.000213,1,0.000053,0.000543,0.003783,29.2203,53.7352,4.9155,19,0.000426,7,0.000000,0.000384
4,58,3,32,10,500,122.8908,0.001136,0.000327,2,0.000000,0.001329,0.002368,22.6885,62.0813,9.3740,15,0.001174,4,0.000000,0.000019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3390,10089,8,?,?,?,259.6296,0.020425,0.000743,9,0.000621,0.000146,0.004555,18.6059,42.8342,6.2754,46,0.000877,5,0.000000,0.000000
3391,10090,8,?,?,?,314.6700,0.028043,0.001157,10,0.000246,0.001083,0.004259,14.3023,36.1156,7.1965,16,0.000788,4,0.000000,0.000000
3392,10092,8,?,?,?,299.4282,0.028341,0.000860,7,0.000338,0.000169,0.004439,12.4028,39.5156,6.3979,19,0.001260,4,0.000000,0.000000
3393,10094,8,?,?,?,375.8664,0.036436,0.000594,5,0.000204,0.000780,0.004346,11.6910,34.8547,7.9615,15,0.000613,6,0.000000,0.000631


###### We can see that some players did not provide this information, so their missing values are marked with a question mark.

###### So we will clean the dataset.
###### To do this, we will remove the missing values from the dataset, then calculate the average of each column that has a problem, then assign the average to the rows that have a question mark.

In [4]:
# Per-column count of cells that hold the literal '?' placeholder.
placeholder_mask = df.isin(['?'])
placeholder_mask.sum(axis=0)

GameID                   0
LeagueIndex              0
Age                     55
HoursPerWeek            56
TotalHours              57
APM                      0
SelectByHotkeys          0
AssignToHotkeys          0
UniqueHotkeys            0
MinimapAttacks           0
MinimapRightClicks       0
NumberOfPACs             0
GapBetweenPACs           0
ActionLatency            0
ActionsInPAC             0
TotalMapExplored         0
WorkersMade              0
UniqueUnitsMade          0
ComplexUnitsMade         0
ComplexAbilitiesUsed     0
dtype: int64

In [5]:
# Show the rows whose Age is the '?' placeholder.
df[df['Age'].eq('?')]

Unnamed: 0,GameID,LeagueIndex,Age,HoursPerWeek,TotalHours,APM,SelectByHotkeys,AssignToHotkeys,UniqueHotkeys,MinimapAttacks,MinimapRightClicks,NumberOfPACs,GapBetweenPACs,ActionLatency,ActionsInPAC,TotalMapExplored,WorkersMade,UniqueUnitsMade,ComplexUnitsMade,ComplexAbilitiesUsed
3340,10001,8,?,?,?,189.7404,0.004582,0.000655,4,7.3e-05,0.000618,0.006291,23.513,32.5665,4.4451,25,0.002218,6,0.0,0.0
3341,10005,8,?,?,?,287.8128,0.02904,0.001041,9,0.000231,0.000656,0.005399,31.6416,36.1143,4.5893,34,0.001138,6,5.8e-05,0.0
3342,10006,8,?,?,?,294.0996,0.02964,0.001076,6,0.000302,0.002374,0.006294,16.6393,36.8192,4.185,26,0.000987,6,0.0,0.0
3343,10015,8,?,?,?,274.2552,0.018121,0.001264,8,5.3e-05,0.000975,0.007111,10.6419,24.3556,4.387,28,0.001106,6,0.0,0.0
3344,10016,8,?,?,?,274.3404,0.023131,0.000739,8,0.000622,0.003552,0.005355,19.1568,36.3098,5.2811,28,0.000739,6,0.0,0.0
3345,10017,8,?,?,?,245.8188,0.010471,0.000841,10,0.000657,0.001314,0.005031,14.5518,36.7134,7.1943,33,0.001474,11,4e-05,4.8e-05
3346,10018,8,?,?,?,211.0722,0.013049,0.00094,10,0.000366,0.000909,0.003719,19.6169,38.9326,7.132,23,0.000898,9,0.0,0.0
3347,10021,8,?,?,?,189.5778,0.007559,0.000487,10,0.000606,0.000566,0.005821,22.0317,36.733,4.905,28,0.00054,5,0.0,0.0
3348,10022,8,?,?,?,210.5088,0.007974,0.000867,7,0.000548,0.000638,0.006518,15.7856,30.7156,4.8058,34,0.000817,6,0.0,0.0
3349,10023,8,?,?,?,248.0118,0.014722,0.001752,7,0.000375,0.00011,0.004115,17.4656,34.2357,7.8973,20,0.001111,8,0.0,0.0


In [6]:
# Working copy holding only the rows with a real Age value. Selecting on the
# '?' placeholder picks the rows by content instead of a hard-coded position.
df2 = df[df['Age'] != '?']

In [7]:
# Display the working copy as the cell output.
df2

Unnamed: 0,GameID,LeagueIndex,Age,HoursPerWeek,TotalHours,APM,SelectByHotkeys,AssignToHotkeys,UniqueHotkeys,MinimapAttacks,MinimapRightClicks,NumberOfPACs,GapBetweenPACs,ActionLatency,ActionsInPAC,TotalMapExplored,WorkersMade,UniqueUnitsMade,ComplexUnitsMade,ComplexAbilitiesUsed
0,52,5,27,10,3000,143.7180,0.003515,0.000220,7,0.000110,0.000392,0.004849,32.6677,40.8673,4.7508,28,0.001397,6,0.0,0.000000
1,55,5,23,10,5000,129.2322,0.003304,0.000259,4,0.000294,0.000432,0.004307,32.9194,42.3454,4.8434,22,0.001193,5,0.0,0.000208
2,56,4,30,10,200,69.9612,0.001101,0.000336,4,0.000294,0.000461,0.002926,44.6475,75.3548,4.0430,22,0.000745,6,0.0,0.000189
3,57,3,19,20,400,107.6016,0.001034,0.000213,1,0.000053,0.000543,0.003783,29.2203,53.7352,4.9155,19,0.000426,7,0.0,0.000384
4,58,3,32,10,500,122.8908,0.001136,0.000327,2,0.000000,0.001329,0.002368,22.6885,62.0813,9.3740,15,0.001174,4,0.0,0.000019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3335,9261,4,20,8,400,158.1390,0.013829,0.000504,7,0.000217,0.000313,0.003583,36.3990,66.2718,4.5097,30,0.001035,7,0.0,0.000287
3336,9264,5,16,56,1500,186.1320,0.006951,0.000360,6,0.000083,0.000166,0.005414,22.8615,34.7417,4.9309,38,0.001343,7,0.0,0.000388
3337,9265,4,21,8,100,121.6992,0.002956,0.000241,8,0.000055,0.000208,0.003690,35.5833,57.9585,5.4154,23,0.002014,7,0.0,0.000000
3338,9270,3,20,28,400,134.2848,0.005424,0.000182,5,0.000000,0.000480,0.003205,18.2927,62.4615,6.0202,18,0.000934,5,0.0,0.000000


In [None]:
# Cast Age to int on the working copy, store its mean in `a`, and show it.
df2 = df2.assign(Age=df2["Age"].astype(int))
a = df2.Age.mean()
a

In [None]:
# Write the mean `a` into every Age cell that still holds '?'.
age_missing = df["Age"] == "?"
df.loc[age_missing, "Age"] = a

In [None]:
# Show the rows whose HoursPerWeek is the '?' placeholder.
df[df['HoursPerWeek'].eq('?')]

In [None]:
# Working copy: the first 3339 rows of df (positions 0 through 3338).
df2 = df.iloc[:3339]

In [None]:
# Display the working copy as the cell output.
df2

In [None]:
# Remove every row of the working copy whose HoursPerWeek is still the '?'
# placeholder. Matching on the value avoids a hard-coded row position.
df2 = df2[df2['HoursPerWeek'] != '?']

In [None]:
# Cast HoursPerWeek to int on the working copy, store its mean in `a`, show it.
df2 = df2.assign(HoursPerWeek=df2["HoursPerWeek"].astype(int))
a = df2.HoursPerWeek.mean()
a

In [None]:
# Write the mean `a` into every HoursPerWeek cell that still holds '?'.
hours_missing = df["HoursPerWeek"] == "?"
df.loc[hours_missing, "HoursPerWeek"] = a

In [None]:
# Show the rows whose TotalHours is the '?' placeholder.
df[df['TotalHours'].eq('?')]

In [None]:
# Working copy: the first 3339 rows of df (positions 0 through 3338).
df2 = df.iloc[:3339]

In [None]:
# Remove the row sitting at position 358 of the working copy.
# NOTE(review): presumably a row whose TotalHours is '?' — confirm.
row_label = df2.index[358]
df2 = df2.drop(index=row_label)

In [None]:
# Remove every remaining row whose TotalHours is the '?' placeholder.
# Selecting by value instead of by position keeps this cell correct no matter
# how earlier removals shifted the rows.
df2 = df2[df2['TotalHours'] != '?']

In [None]:
# Cast TotalHours to int on the working copy, store its mean in `a`, show it.
df2 = df2.assign(TotalHours=df2["TotalHours"].astype(int))
a = df2.TotalHours.mean()
a

In [None]:
# Write the mean `a` into every TotalHours cell that still holds '?'.
total_hours_missing = df["TotalHours"] == "?"
df.loc[total_hours_missing, "TotalHours"] = a

In [None]:
# Recount the '?' placeholders per column to check that the dataset is cleaned.
remaining_placeholders = df.isin(['?'])
remaining_placeholders.sum(axis=0)

###### We managed to keep the problematic lines, and the dataset is now usable.

In [None]:
# Convert the three repaired columns to 64-bit integers, one column at a time.
for column in ("Age", "HoursPerWeek", "TotalHours"):
    df[column] = df[column].astype(np.int64)

In [None]:
# Print the column summary again to check the dtypes after the conversion.
df.info()

###### The types of all the columns are good, let's start visualizing the data!

In [None]:
# Figure defaults for the plots: white-grid theme and a 10x8 figure size.
sns.set_theme(style="whitegrid", rc={'figure.figsize': (10, 8)})


In [None]:

# Bar chart of the number of players in each league, with the count written
# inside each bar.
ax = sns.countplot(x="LeagueIndex", data=df, palette="flare")
plt.title("Players in each leagues", fontsize=24)
plt.ylabel('Number of players', fontsize=18)
plt.xlabel('League Index ', fontsize=18)
for p in ax.patches:
    # The '{:d}' format only accepts integers, and a bar height may be a
    # float, so convert it before formatting.
    count = int(p.get_height())
    ax.annotate('{:d}'.format(count), xy=(p.get_x() + 0.4, p.get_height() - 20),
                fontsize=15, color='#d3dae6', ha='center', va="center", weight='bold')

In [None]:

# Box plot of Age for each league.
# The theme and the figure size are applied before the figure is created:
# style settings only affect axes created after they are set.
sns.set_theme(style="whitegrid", rc={'figure.figsize': (15, 9)})
fig, ax = plt.subplots(1)
ax = sns.boxplot(x="LeagueIndex", y="Age", data=df, palette='flare', ax=ax)




###### Remember the values that had a question mark at the beginning? They were the lines of the professional players, in category 8.
###### Since we have replaced their values by the average of the others, it is normal that on the plot we have only one line, they all have the same value.
###### We will thus have this line of equal values for the plots of the three columns.

In [None]:
# Copy of df without the rows whose LeagueIndex is 8.
dfInterval = df.loc[df["LeagueIndex"].ne(8)].copy()

In [None]:
# Plot: age groups of the players as a function of LeagueIndex

# Bin the ages into labelled groups. The assignment aligns on the row index,
# so only the rows present in dfInterval receive a value.
dfInterval["Age"] = pd.cut(df["Age"], bins=[15, 18, 22, 25, 30, 45],
                           include_lowest=True, precision=0, ordered=False,
                           labels=["15-18", "18-22", "22-25", "25-30", "30-44"])

# `fill=True` is the current spelling of the deprecated `shade=True` keyword.
ax = sns.kdeplot(x="LeagueIndex", hue="Age", data=dfInterval, fill=True,
                 palette="flare", cut=0, multiple="stack")
plt.title("Age of players per league", fontsize=24)
plt.xlabel('League Index ', fontsize=18)
plt.ylabel('Number of players (%)', fontsize=18)

# End

###### Here is another graph, nicer.

In [None]:
# Box plot of HoursPerWeek for each league.
fig, ax = plt.subplots(1)
sns.set_theme(style="whitegrid")
sns.boxplot(data=df, x="LeagueIndex", y="HoursPerWeek", palette='flare', ax=ax);


###### We have our first trend!
###### It is quite trivial and logical.
###### We observe that the more hours we play per week, the better we are!
###### In competitive games like Starcraft2, being good means playing a lot. It's a very complicated game to master and requires a lot of skills. Players have to practice more and more to progress.
###### One player indicated that he plays 168 hours a week, so either this player was mistaken or he's investing a little too much in the game! (24*7 = 168 hours for a full week)
###### He is rank 6 and determined to move up!

In [None]:
# Box plot of TotalHours for each league.
fig, ax = plt.subplots(1)
sns.set_theme(style="whitegrid")
sns.boxplot(data=df, x="LeagueIndex", y="TotalHours", palette='flare', ax=ax);


###### We detect an anomaly in column 5, let's go fix it.#

In [None]:
# List the rows from the largest TotalHours down to the smallest.
df.sort_values("TotalHours", ascending=False)

###### Line 1793, the player has indicated that he has played 1000000 hours. This value is an error, so we will remove his line for the plot.

In [None]:

# Plotting copy of df without the row that holds the largest TotalHours.
# Looking the row up with idxmax avoids depending on a hard-coded position.
outlier_label = df['TotalHours'].idxmax()
df2 = df.drop(index=outlier_label)


In [None]:
# Box plot of TotalHours for each league, drawn from df2.
fig, ax = plt.subplots(1)
sns.set_theme(style="whitegrid")
sns.boxplot(data=df2, x="LeagueIndex", y="TotalHours", palette='flare', ax=ax);

######  As this game is very time consuming, it is no surprise that the best players have the most playing time.
######  There is a correlation between these two variables.
######  Here is another version of the plot, more realistic because less crushed.
 

In [None]:
# Plot: TotalHours groups as a function of LeagueIndex

# Bin from the TotalHours column of df (the assignment aligns on the row
# index), so re-running this cell never tries to bin an already-binned column.
dfInterval["TotalHours"] = pd.cut(df["TotalHours"], bins=[0, 100, 250, 500, 1000, 3000, 25000],
                                  include_lowest=True, precision=0, ordered=False,
                                  labels=["0-100", "100-250", "250-500", "500-1000", "1000-3000", "3000-25000"])
# NOTE(review): sns.set() also re-applies seaborn's default style — confirm intended.
sns.set(rc={'figure.figsize':(15,12)})

# `fill=True` is the current spelling of the deprecated `shade=True` keyword.
ax = sns.kdeplot(x="LeagueIndex", hue="TotalHours", data=dfInterval, fill=True,
                 palette="flare", cut=0, multiple="fill")
plt.title("TotalHours per LeagueIndex", fontsize=24)
plt.xlabel('League Index ', fontsize=18)
plt.ylabel('Number of players (%)', fontsize=18)


# End

In [None]:
# Box plot of APM for each league.
# The theme is applied before the figure is created: style settings only
# affect axes created after they are set.
sns.set_theme(style="whitegrid")
fig, ax = plt.subplots(1)
ax = sns.boxplot(x="LeagueIndex", y="APM", data=df, palette='flare', ax=ax)

###### This one speaks for itself!
###### For those who don't know what APM is, it stands for "Actions Per Minute".
###### It is completely logical in such a game, you have to manage dozens of units at the same time, keep a perfect organization, do tons of things!
###### The correlation is important, and we can already assume that this variable will be very important for our future models.

In [None]:
# Box plot of SelectByHotkeys for each league.
fig, ax = plt.subplots(1)
sns.set_theme(style="whitegrid")
sns.boxplot(data=df, x="LeagueIndex", y="SelectByHotkeys", palette='flare', ax=ax);

###### This variable 'Select By Hotkeys' is the number of unit or building selections made using hotkeys per timestamp (continuous).
###### This means that the player must know his shortcuts to use them wisely.
###### Unsurprisingly, a novice player will use his mouse rather than his shortcuts.
###### Using these shortcuts takes a lot of learning to be effective.
###### Not surprisingly again, this correlation is huge, there is a real gap between the professional players and the others.

In [None]:
# Box plot of AssignToHotkeys for each league.
fig, ax = plt.subplots(1)
sns.set_theme(style="whitegrid")
sns.boxplot(data=df, x="LeagueIndex", y="AssignToHotkeys", palette='flare', ax=ax);

###### This variable 'Assign to Hotkeys' is the number of units or buildings assigned to hotkeys per timestamp (continuous).
###### In addition to the previous graph, the more shortcuts a player has assigned to various actions, the better he is.
###### Same interpretation as above, it requires knowledge and a lot of training to be able to choose the right shortcuts and to use them.
###### This correlation is once again very strong.

In [None]:
# Box plot of UniqueHotkeys for each league.
fig, ax = plt.subplots(1)
sns.set_theme(style="whitegrid")
sns.boxplot(data=df, x="LeagueIndex", y="UniqueHotkeys", palette='flare', ax=ax);

###### This variable 'Unique Hotkeys' is the number of unique hotkeys used per timestamp (continuous).
###### Up to the platinum rank we don't have a very high correlation.
###### However, from the diamond rank onwards, the number of different shortcuts used is correlated with the rank.
###### A very good player will use between 6 and 10 (the maximum) different shortcuts, while a master player will use between 4 and 7.
###### To be the best at starcraft2, it seems to be better to be able to play with many different shortcuts, which shows a great mastery of the game.



In [None]:
# Box plot of MinimapAttacks for each league.
fig, ax = plt.subplots(1)
sns.set_theme(style="whitegrid")
sns.boxplot(data=df, x="LeagueIndex", y="MinimapAttacks", palette='flare', ax=ax);

###### This variable 'Minimap Attacks' is the number of attack actions on minimap per timestamp (continuous).
###### Another positively correlated variable, the best players are those who do the most attacking via the minimap. 
###### This certainly saves some time, but it is still necessary to be able to master this ability.


In [None]:
# Box plot of MinimapRightClicks for each league.
fig, ax = plt.subplots(1)
sns.set_theme(style="whitegrid")
sns.boxplot(data=df, x="LeagueIndex", y="MinimapRightClicks", palette='flare', ax=ax);

###### The right click allows you to do many things on this game. Not surprisingly, those who are able to use the right click via minimap and do the most are the best players.
###### This reflects a certain skill and a great mastery of the game.

In [None]:
# Box plot of NumberOfPACs for each league.
fig, ax = plt.subplots(1)
sns.set_theme(style="whitegrid")
sns.boxplot(data=df, x="LeagueIndex", y="NumberOfPACs", palette='flare', ax=ax);

###### We went to find out what PACs were:  
###### Perception Action Cycle, this is apparently a very well-known indicator in the world of competitive video games.
###### The more PACs you have, the better you are at the game.
###### And this variable doesn't seem to be wrong! 
###### Indeed, the best players have the highest number of PACs.

In [None]:
# Box plot of GapBetweenPACs for each league.
fig, ax = plt.subplots(1)
sns.set_theme(style="whitegrid")
sns.boxplot(data=df, x="LeagueIndex", y="GapBetweenPACs", palette='flare', ax=ax);

###### The fastest and most experienced players are the ones who can string together their PACs.
###### The beginners and the less fast players will space their PACs which will slow down their game.
###### Unsurprisingly, they will win less because they are less active and therefore less efficient than their opponents.

In [None]:
# Box plot of ActionLatency for each league.
fig, ax = plt.subplots(1)
sns.set_theme(style="whitegrid")
sns.boxplot(data=df, x="LeagueIndex", y="ActionLatency", palette='flare', ax=ax);

###### Almost the same as above, the more time a player lets pass between his actions, the worse he is.
###### To be good at Starcraft2, you have to go very fast, constantly think about your next move, and not give your opponent any time to breathe.
###### The best players are those who optimize their time the most.

In [None]:
# Box plot of ActionsInPAC for each league.
fig, ax = plt.subplots(1)
sns.set_theme(style="whitegrid")
sns.boxplot(data=df, x="LeagueIndex", y="ActionsInPAC", palette='flare', ax=ax);

###### Very slight correlation on this variable.
###### It is better to focus on the total number of PACs than on the number of actions in each PAC.

In [None]:
# Box plot of TotalMapExplored for each league.
fig, ax = plt.subplots(1)
sns.set_theme(style="whitegrid")
sns.boxplot(data=df, x="LeagueIndex", y="TotalMapExplored", palette='flare', ax=ax);

###### This variable 'Total Map Explored' is the number of 24x24 game coordinate grids viewed by the player per timestamp (continuous).
###### This variable is positively correlated with the player's rank, up to the grandmaster rank.
###### After that it seems that the best players explore the map less.
###### This may be due to the fact that professional players need to explore less before attacking. They are more efficient in their exploration and therefore have better information about the enemies.
###### Perhaps the professional level games are shorter and therefore leave less time to explore the map.

In [None]:
# Box plot of WorkersMade for each league.
fig, ax = plt.subplots(1)
sns.set_theme(style="whitegrid")
sns.boxplot(data=df, x="LeagueIndex", y="WorkersMade", palette='flare', ax=ax);


###### Variable very slightly correlated up to grandmaster.
###### Making more units makes you better at the game.
###### However, pro players do a bit less than other ranks, probably because they are more efficient with fewer units, or their games are shorter.

In [None]:
# Box plot of UniqueUnitsMade for each league.
fig, ax = plt.subplots(1)
sns.set_theme(style="whitegrid")
sns.boxplot(data=df, x="LeagueIndex", y="UniqueUnitsMade", palette='flare', ax=ax);

###### The diversity of units created in starcraft2 is a slight indicator of better performance.
###### However, the best players use a smaller variety of units. 

In [None]:
# Box plot of ComplexUnitsMade for each league.
fig, ax = plt.subplots(1)
sns.set_theme(style="whitegrid")
sns.boxplot(data=df, x="LeagueIndex", y="ComplexUnitsMade", palette='flare', ax=ax);

###### This variable is positively correlated, up to the grandmaster, the more a player uses a "complex" unit the better he is.
###### However, most pro players do not use any.
###### This type of unit should not be the most effective in competition against experienced players.

In [None]:
# Box plot of ComplexAbilitiesUsed for each league.
fig, ax = plt.subplots(1)
sns.set_theme(style="whitegrid")
sns.boxplot(data=df, x="LeagueIndex", y="ComplexAbilitiesUsed", palette='flare', ax=ax);

###### Same hypothesis as above.
###### Using complex abilities is probably less effective at the professional level.
###### This variable is still positively correlated.

###### Most of the variables are highly correlated with the level of the players.
###### However, we notice that for some variables, the correlation only occurs up to the grandmaster rank.
###### It seems that the professional players play in a slightly different way, more efficient, faster.
###### It could be interesting to use future prediction models without professional players, to see if it improves the performance of the models.



In [None]:
# Plot: number of players per LeagueIndex, split by HoursPerWeek group
dfInterval = df.copy()
dfInterval.HoursPerWeek = pd.cut(dfInterval.HoursPerWeek, bins=[0, 10, 20, 50, 170],
                                 include_lowest=True, precision=0, ordered=False,
                                 labels=["0-10", "10-20", "20-50", "50-170"])

# NOTE(review): sns.set() also re-applies seaborn's default style — confirm intended.
sns.set(rc={'figure.figsize':(15,12)})

ax = sns.countplot(x="LeagueIndex", hue="HoursPerWeek", data=dfInterval, palette="flare")
plt.title("Number of players by league index and HoursPerWeek", fontsize=24)
plt.ylabel('Number of players', fontsize=18)
plt.xlabel('League Index', fontsize=18)
for p in ax.patches:
    height = p.get_height()
    # A bar without a finite height has no count to print, and '{:d}' would
    # raise on it, so skip it.
    if not np.isfinite(height):
        continue
    # '{:d}' only accepts integers and the height may be a float.
    ax.annotate('{:d}'.format(int(height)), xy=(p.get_x() + 0.105, height - 15),
                fontsize=15, color='#d3dae6', ha='center', va="center", weight='bold')

# End

In [None]:
# Target: the league of each player.
y = df['LeagueIndex']

# X holds all eighteen predictor columns.
all_features = [
    'Age', 'HoursPerWeek', 'TotalHours', 'APM', 'SelectByHotkeys',
    'AssignToHotkeys', 'UniqueHotkeys', 'MinimapAttacks', 'MinimapRightClicks',
    'NumberOfPACs', 'GapBetweenPACs', 'ActionLatency', 'ActionsInPAC',
    'TotalMapExplored', 'WorkersMade', 'UniqueUnitsMade', 'ComplexUnitsMade',
    'ComplexAbilitiesUsed',
]
X = df[all_features]

# Xa holds a ten-column subset of those predictors.
selected_features = [
    'HoursPerWeek', 'TotalHours', 'APM', 'SelectByHotkeys', 'AssignToHotkeys',
    'MinimapAttacks', 'MinimapRightClicks', 'NumberOfPACs', 'GapBetweenPACs',
    'ActionLatency',
]
Xa = df[selected_features]

In [None]:
from sklearn.model_selection import train_test_split

# Split Xa / y into train and test parts; 25% of the rows go to the test part
# and the shuffle is seeded with 12.
X_train, X_test, y_train, y_test = train_test_split(
    Xa, y, random_state=12, test_size=0.25
)

In [None]:
# NOTE(review): RandomForestClassifier is imported here, but the model built
# below is a RandomForestRegressor — confirm which one was intended.
from sklearn.ensemble import RandomForestClassifier

# Fit a 1000-tree random forest regressor on the training part, then predict
# the held-out part.
rf = RandomForestRegressor(random_state=42, n_estimators=1000)
rf.fit(X_train, y_train)

predictions = rf.predict(X_test)
    

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score

# Round every predicted value to an integer so it can be compared with the
# true labels, then print the confusion matrix and the accuracy.
pred = []
for value in predictions:
    pred.append(int(round(value)))

conf_mat = confusion_matrix(y_test, pred)
print(conf_mat)
print("Accuracy : ", accuracy_score(y_test, pred))