<a href="https://colab.research.google.com/github/SimonADDA/ML-Poker-Hand/blob/main/ProjetPoker.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Welcome to the Poker Hand prediction project

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import sys
import warnings
warnings.filterwarnings(action='ignore')

#!pip install imbalanced-learn
import imblearn

# import required modules
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
from matplotlib.colors import ListedColormap
from sklearn import neighbors, datasets
import random
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.datasets import make_blobs
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, ExtraTreesClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Read the poker-hand data set. The file has no header row (header=None),
# so pandas labels the columns with integers; they are renamed in a later cell.
df =  pd.read_csv('poker_test.data',sep=',',header=None)
print(df)

# Data cleaning of our dataset

In [None]:
# Name the integer-labelled columns: an S<i>/C<i> pair for each of the five cards, then the target.
column_names = [f"{prefix}{card}" for card in range(1, 6) for prefix in ("S", "C")] + ["target"]
df = df.rename(columns=dict(enumerate(column_names)))

In [None]:
df.head()

In [None]:
#Difference between two cards
def add_diffs(df:pd.DataFrame):
    """Add the columns Diff1..Diff4 to ``df`` in place.

    Each DiffN is the difference between two neighbouring card columns,
    walking from the last pair to the first: Diff1 = C5 - C4,
    Diff2 = C4 - C3, Diff3 = C3 - C2 and Diff4 = C2 - C1.

    Args:
        df: frame containing the columns C1..C5; it is modified in place.

    Returns:
        None.
    """
    for n in range(1, 5):
        upper = 6 - n  # n=1 -> C5, n=2 -> C4, ...
        df[f'Diff{n}'] = df[f'C{upper}'] - df[f'C{upper - 1}']

In [None]:
add_diffs(df)

In [None]:
# Count, for every hand, how many of its five S columns equal each code 1..4.
# The counts live in their own frame (df_card) instead of being added to df
# and dropped again, and all four headers are spelled consistently.
suit_columns = ['S1', 'S2', 'S3', 'S4', 'S5']
df_card = pd.DataFrame(
    {f"sum_{code}": (df[suit_columns] == code).sum(axis=1) for code in range(1, 5)}
)

# Largest of the four counts for each hand (5 means all five S values are equal)
df['max_value_type_card'] = df_card.max(axis=1)
df.head()

In [None]:
#New variable sum of a poker hand
# Build a separate frame holding only the five C columns (C1..C5);
# the row-wise sum is added to it in the next cell.
df_sum=pd.DataFrame(df, columns=['C1','C2','C3','C4','C5'])
df_sum.head()

In [None]:
# Sum only the five card columns: summing the whole frame would fold an
# already-computed 'sum_card' into the total when this cell is run again.
df_sum['sum_card'] = df_sum[['C1','C2','C3','C4','C5']].sum(axis=1)
df_sum.head()

In [None]:
#Copy the per-hand card sum computed in df_sum into the main frame
df['sum_card']=df_sum.sum_card
#df['mean']=df_mean.mean
df

## Model on raw data

In [None]:
def get_score(algorithme, X_train, X_test, y_train, y_test, display_graph=False, display_options=True):
    """Fit ``algorithme`` on the training split and return its score on the test split.

    Args:
        algorithme: estimator exposing ``fit``, ``score`` and (when
            ``display_graph`` is set) ``predict``.
        X_train, y_train: data passed to ``fit``.
        X_test, y_test: data passed to ``score``.
        display_graph: when true, scatter-plot the predictions against ``y_test``.
        display_options: when true, print the estimator before fitting.

    Returns:
        Whatever ``score`` of the fitted estimator returns for the test split.
    """
    if display_options:
        print("fitting :\n"+ str(algorithme))
    fitted = algorithme.fit(X_train, y_train)
    test_score = fitted.score(X_test, y_test)
    if not display_graph:
        return test_score
    # Predictions against true values
    import matplotlib.pyplot as plt
    plt.scatter(x=y_test, y=algorithme.predict(X_test))
    return test_score

In [None]:
performances = dict()

In [None]:
# The label is the 'target' column; the features are every other column of df.
y = df['target']
feature_columns = [name for name in df.columns if name != 'target']
X = df.loc[:, feature_columns]

In [None]:
# random.seed() returns None, so passing it as random_state leaves the split
# unseeded; a fixed integer makes the split (and every score derived from it)
# reproducible across runs.
X_train,X_test,Y_train,Y_test = train_test_split(X,y,test_size=0.3,random_state=42)
for part in (X_train, X_test, Y_train, Y_test):
    print(part.shape)

In [None]:
clf = RandomForestClassifier(n_estimators=20, max_depth=None,
    min_samples_split=2, random_state=0)
score = get_score(clf, X_train, X_test, Y_train, Y_test)
performances[clf] = score
print(score)

Standardization of the variables, to see whether the results improve

In [None]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits. X_train / X_test are overwritten by the results.
scaler   = StandardScaler().fit(X_train)
X_train  = scaler.transform(X_train)
X_test   = scaler.transform(X_test)

In [None]:
pd.DataFrame(X_train).head()

In [None]:
get_score(clf, X_train, X_test, Y_train, Y_test)

In [None]:
clf = RandomForestClassifier(n_estimators=20, max_depth=None,
    min_samples_split=2, random_state=0)
score = get_score(clf, X_train, X_test, Y_train, Y_test)
performances[clf] = score
print(score)

In [None]:
algorithme = ExtraTreesClassifier()
score      = get_score(algorithme, X_train, X_test, Y_train, Y_test)
performances[algorithme] = score
print(score)

In [None]:
algorithme=KNeighborsClassifier(n_neighbors=3)
score      = get_score(algorithme, X_train, X_test, Y_train, Y_test)
performances[algorithme] = score
print(score)

In [None]:
algorithme=DecisionTreeClassifier()
score      = get_score(algorithme, X_train, X_test, Y_train, Y_test)
performances[algorithme] = score
print(score)

In [None]:
algorithme=BaggingClassifier(KNeighborsClassifier(),
                            n_estimators=10, random_state=0)
score      = get_score(algorithme, X_train, X_test, Y_train, Y_test)
performances[algorithme] = score
print(score)

In [None]:
from collections import OrderedDict
dico_ordonne = OrderedDict(performances)

import pandas as pd
df_result1 = pd.DataFrame()
df_result1["perf"] = dico_ordonne.values()
df_result1["algo"] = dico_ordonne.keys()
df_result1['nom_algo'] = df_result1.algo.apply(lambda algo: str(algo).split('(')[0])
df_result1.set_index('nom_algo', inplace=True)
df_result1

In [None]:
# NOTE(review): `values` is not used by the code in this cell (the np.select call below is commented out).
values = ['H', 'S', 'D', 'C']

# Combine each card's S and C columns into a single integer code: SC = S + 10*C
# (for example S=3 and C=12 give 123).
#df['S1'] = np.select(conditions, values)
df['SC1']=df['S1']+10*df['C1']
df['SC2']=df['S2']+10*df['C2']
df['SC3']=df['S3']+10*df['C3']
df['SC4']=df['S4']+10*df['C4']
df['SC5']=df['S5']+10*df['C5']

# The original S*/C* columns are dropped in place, so this cell cannot be
# re-run without first rebuilding df.
df.drop(columns=['S1', 'C1','S2','C2','S3','C3','S4','C4','S5','C5'], inplace=True)
# del df['S1']
df.head()

In [None]:
# Start from the five combined card codes only
df_sort=pd.DataFrame(df, columns=['SC1','SC2','SC3','SC4','SC5'])

# Sort the five codes of every hand in ascending order (row-wise, in place on the array)
a = df_sort.values
a.sort(axis=1)  # ndarray.sort takes no "ascending" argument; the order is always ascending

# New frame with the sorted values, keeping the original index and column names
df_sort=pd.DataFrame(a, df_sort.index, df_sort.columns)

# Carry over the target and the engineered features (sum_card is stored under the name 'sum')
df_sort['target']=df.target
df_sort['max_value_type_card']=df.max_value_type_card
df_sort['sum']=df.sum_card

# NOTE(review): Diff1..Diff4 are copied unchanged from df, so they do not reflect the row-wise sort applied above
df_sort['Diff1']=df.Diff1
df_sort['Diff2']=df.Diff2
df_sort['Diff3']=df.Diff3
df_sort['Diff4']=df.Diff4

# From here on df refers to the sorted frame
df=df_sort

In [None]:
df

## EDA

In [None]:
df.describe()

In [None]:
df.dtypes

In [None]:
df.shape

In [None]:
# Nb de valeurs uniques par colonnes
valforcols = df.nunique()
valforcols

In [None]:
#Drop fully duplicated rows (df is rebound to the de-duplicated frame) and show the new shape
df=df.drop_duplicates()
df.shape

In [None]:
sns.heatmap(df.isna())

In [None]:
_fig = df.hist()

In [None]:
# Only the last expression of a cell is displayed, so the summary statistics
# have to be printed explicitly or they are silently discarded.
print(df['target'].describe())
# Distribution plot of the target
sns.distplot(df['target'])
#skewness and kurtosis
print("Skewness: %f" % df['target'].skew())
print("Kurtosis: %f" % df['target'].kurt())

In [None]:
#See correlation
plt.figure(figsize=(10,8))
sns.heatmap(df.corr("pearson"),
            vmin=-1, vmax=1,
            cmap='coolwarm',
            annot=True, 
            square=True);

## Model on cleaned data

In [None]:
# Rebuild X / y from the engineered frame: the X and y defined earlier were
# taken from df before the SC encoding, the row-wise sort and the duplicate
# removal, so reusing them would train on the old features.
X = df.loc[:, df.columns != 'target']
y = df.target
# random.seed() returns None (an unseeded split); use a fixed seed instead.
X_train,X_test,Y_train,Y_test = train_test_split(X,y,test_size=0.3,random_state=42)
print(X_train.shape)
print(X_test.shape)

In [None]:
performances = dict()

In [None]:
# random.seed() returns None, so passing it as random_state leaves the split
# unseeded; a fixed integer keeps the scores below reproducible.
X_train,X_test,Y_train,Y_test = train_test_split(X,y,test_size=0.3,random_state=42)
print(X_train.shape)
print(X_test.shape)

In [None]:
hyperparametres = {"n_estimators"  :  30, "max_features"  :  3, "max_depth"     :  50,}
clf = RandomForestClassifier(**hyperparametres)
score = get_score(clf, X_train, X_test, Y_train, Y_test)
performances[clf] = score
print(score)

In [None]:
algorithme = ExtraTreesClassifier(n_estimators=20, max_depth=None,
    min_samples_split=2, random_state=0)
score      = get_score(algorithme, X_train, X_test, Y_train, Y_test)
performances[algorithme] = score
print(score)

In [None]:
algorithme=KNeighborsClassifier(n_neighbors=3)
score      = get_score(algorithme, X_train, X_test, Y_train, Y_test)
performances[algorithme] = score
print(score)

In [None]:
algorithme=DecisionTreeClassifier(criterion='entropy')
score      = get_score(algorithme, X_train, X_test, Y_train, Y_test)
performances[algorithme] = score
print(score)

In [None]:
algorithme=BaggingClassifier(KNeighborsClassifier(),
                            n_estimators=10, random_state=0)
score      = get_score(algorithme, X_train, X_test, Y_train, Y_test)
performances[algorithme] = score
print(score)

In [None]:
from collections import OrderedDict
dico_ordonne = OrderedDict(performances)

import pandas as pd
df_result2 = pd.DataFrame()
df_result2["perf"] = dico_ordonne.values()
df_result2["algo"] = dico_ordonne.keys()
df_result2['nom_algo'] = df_result2.algo.apply(lambda algo: str(algo).split('(')[0])
df_result2.set_index('nom_algo', inplace=True)
df_result2

We have an imbalanced distribution for the target, so we balance it with random over-sampling (imbalanced-learn's `RandomOverSampler`).

In [None]:
# Class counts before resampling
print('Original dataset shape %s' % Counter(y))

# Over-sample X / y with RandomOverSampler (seeded for reproducibility).
# NOTE(review): the resampling is applied before the train/test split done
# further down, so copies of the same row can end up in both splits and
# inflate the test scores — consider over-sampling the training split only.
ros = RandomOverSampler(random_state=42)
X_res, y_res = ros.fit_resample(X, y)
# Class counts after resampling
print('Resampled dataset shape %s' % Counter(y_res))

In [None]:
X_res = pd.DataFrame(X_res)
# Keep the label one-dimensional: wrapping it in a DataFrame produces an
# (n, 1) column vector, whereas scikit-learn classifiers expect y of shape (n,).
y_res = pd.Series(np.asarray(y_res).ravel(), name='target')
y_res.value_counts()

In [None]:
X=X_res
y=y_res

## Model on the resampled data

In [None]:
performances = dict()

In [None]:
# random.seed() returns None, so passing it as random_state leaves the split
# unseeded; a fixed integer keeps the scores below reproducible.
X_train,X_test,Y_train,Y_test = train_test_split(X,y,test_size=0.3,random_state=42)
print(X_train.shape)
print(X_test.shape)

In [None]:
clf = RandomForestClassifier(n_estimators=20, max_depth=None,
    min_samples_split=2, random_state=0)
score = get_score(clf, X_train, X_test, Y_train, Y_test)
performances[clf] = score
print(score)

In [None]:
algorithme = ExtraTreesClassifier(n_estimators=20, max_depth=None,
    min_samples_split=2, random_state=0)
score      = get_score(algorithme, X_train, X_test, Y_train, Y_test)
performances[algorithme] = score
print(score)

In [None]:
algorithme=KNeighborsClassifier(n_neighbors=5)
score      = get_score(algorithme, X_train, X_test, Y_train, Y_test)
print(score)

In [None]:
# Work on NumPy copies under new names: overwriting X_train / Y_train would make
# this cell fail when run twice and would leave later cells fitting on arrays
# while scoring on DataFrames.
X_train_arr = np.asarray(X_train)
Y_train_arr = np.asarray(Y_train).ravel()  # 1-D labels, whatever container Y_train is

# Find the best k with a 3-fold cross-validation on the training split
from sklearn.model_selection import KFold
kf = KFold(n_splits=3, shuffle=True, random_state=42)  # seeded so the folds are reproducible

k_values = list(range(1, 6))  # candidate numbers of neighbours
scores = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    fold_scores = []
    for learn, test in kf.split(X_train_arr):  # loop over the validation folds
        knn.fit(X_train_arr[learn], Y_train_arr[learn])
        fold_scores.append(knn.score(X_train_arr[test], Y_train_arr[test]))
    # mean accuracy over the folds (comparable across different n_splits)
    scores.append(float(np.mean(fold_scores)))
print(scores)
k_opt = k_values[scores.index(max(scores))]  # k with the highest mean accuracy
print(k_opt)

In [None]:
algorithme=KNeighborsClassifier(n_neighbors=3)
score      = get_score(algorithme, X_train, X_test, Y_train, Y_test)
performances[algorithme] = score
print(score)

In [None]:
algorithme=DecisionTreeClassifier(criterion='entropy')
score      = get_score(algorithme, X_train, X_test, Y_train, Y_test)
performances[algorithme] = score
print(score)

In [None]:
algorithme=BaggingClassifier(KNeighborsClassifier(),
                            n_estimators=10, random_state=0)
score      = get_score(algorithme, X_train, X_test, Y_train, Y_test)
performances[algorithme] = score
print(score)

In [None]:
algorithme = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
                                        max_depth=1, random_state=0)
score      = get_score(algorithme, X_train, X_test, Y_train, Y_test)
performances[algorithme] = score
print(score)

In [None]:
from collections import OrderedDict
dico_ordonne = OrderedDict(performances)

import pandas as pd
df_result3= pd.DataFrame()
df_result3["perf"] = dico_ordonne.values()
df_result3["algo"] = dico_ordonne.keys()
df_result3['nom_algo'] = df_result3.algo.apply(lambda algo: str(algo).split('(')[0])
df_result3.set_index('nom_algo', inplace=True)
df_result3

In [None]:
df_result3[["perf"]].plot(kind='line', rot=60)

## Label encoder

In [None]:
df.head()

In [None]:
# Generate binary values using get_dummies
#df = pd.get_dummies(df, columns=['SC1','SC2','SC3','SC4','SC5'])
#df.drop(columns=['SC1_134','SC2_134','SC3_134','SC4_134','SC5_134'], inplace=True)
df.head()

In [None]:
# random.seed() returns None, so passing it as random_state leaves the split
# unseeded; a fixed integer keeps the scores below reproducible.
X_train,X_test,Y_train,Y_test = train_test_split(X,y,test_size=0.3,random_state=42)
print(X_train.shape)
print(X_test.shape)

In [None]:
clf = RandomForestClassifier(n_estimators=20, max_depth=None,
    min_samples_split=2, random_state=0)
score = get_score(clf, X_train, X_test, Y_train, Y_test)
performances[clf] = score
print(score)

In [None]:
#define total sample size desired
#N = 50000
#perform stratified random sampling
#df=df.groupby('target', group_keys=False).apply(lambda x: x.sample(int(np.rint(N*len(x)/len(df))))).sample(frac=1).reset_index(drop=True)

### Cross Validation on our best model : Random forest

In [None]:
def multiple_cross_val_scores(algorithme, X, y, scoring='accuracy'):
    """Cross-validate ``algorithme`` with 3, 8 and 13 shuffled folds.

    Args:
        algorithme: estimator handed to ``cross_val_score``.
        X: feature matrix.
        y: labels.
        scoring: metric name passed to ``cross_val_score``. Defaults to
            ``'accuracy'`` because the estimators evaluated in this notebook
            are classifiers; ``'r2'`` is a regression metric.

    Returns:
        dict mapping the number of folds to a ``(mean, std)`` tuple of the
        per-fold scores.
    """
    results = {}
    for n_folds in range(3, 15, 5):
        fold_scores = cross_val_score(
            algorithme, X, y,
            cv=KFold(shuffle=True, n_splits=n_folds),
            scoring=scoring,
        )
        results[n_folds] = fold_scores.mean(), fold_scores.std()
    return results

In [None]:
#Cross val
#test = multiple_cross_val_scores(RandomForestClassifier(),X, y)
#test = pd.DataFrame(test, index=["mean", "std"]).T
#test

In [None]:
#new_index = [str(x) + " folds" for x in test.index]
#test.index = new_index
#test.plot(kind='bar', title='Cross-validation using all data with {} lignes'.format(X.shape[0]))

### Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV
rfc=RandomForestClassifier(random_state=42)

# 'auto' is no longer accepted for RandomForestClassifier.max_features in
# recent scikit-learn releases, and for classifiers it was an alias of 'sqrt',
# so listing both explored the same setting twice. Compare 'sqrt' with 'log2'.
param_grid = {
    'n_estimators': [200, 500],
    'max_features': ['sqrt', 'log2'],
    'max_depth' : [5, 10],
    'criterion' :['gini', 'entropy']
}
# 5-fold cross-validated search over the grid above
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)
# The search itself is left disabled
#CV_rfc.fit(X_train, Y_train)

In [None]:
#CV_rfc.best_params_

In [None]:
#Before
clf = RandomForestClassifier(n_estimators=20, max_depth=None,
    min_samples_split=2, random_state=0)
score = get_score(clf, X_train, X_test, Y_train, Y_test)
performances[clf] = score
print(score)

In [None]:
#After
# max_features='sqrt' is the setting that 'auto' used to mean for classifiers;
# 'auto' itself is rejected by recent scikit-learn releases at fit time.
hyperparametres = {'criterion': 'entropy',
 'max_features': 'sqrt',
 'n_estimators': 500,
 'max_depth':None,
 'random_state':0
  }
clf = RandomForestClassifier(**hyperparametres)
score = get_score(clf, X_train, X_test, Y_train, Y_test)
performances[clf] = score
print(score)

In [None]:
#results = cross_val_score(clf, X, y, cv=KFold(shuffle=True, n_splits=5))
#display(results, results.mean(), results.std())

In [None]:
# Evaluate k-NN accuracy on the train and test sets for several numbers of neighbours
from sklearn.metrics import accuracy_score
from matplotlib import pyplot

# numbers of neighbours to evaluate
values = list(range(5, 20))

# accuracy collected for every value of k
train_scores, test_scores = [], []

for k in values:
    # configure and fit the model on the training split
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(X_train, Y_train)
    # accuracy on the data the model was fitted on
    train_acc = accuracy_score(Y_train, model.predict(X_train))
    train_scores.append(train_acc)
    # accuracy on the held-out data
    test_acc = accuracy_score(Y_test, model.predict(X_test))
    test_scores.append(test_acc)
    # summarize progress
    print('>%d, train: %.3f, test: %.3f' % (k, train_acc, test_acc))

# plot of train and test accuracy against the number of neighbours
pyplot.plot(values, train_scores, '-o', label='Train')
pyplot.plot(values, test_scores, '-o', label='Test')
pyplot.xlabel('n_neighbors')
pyplot.ylabel('accuracy')
pyplot.legend()
pyplot.show()

Our random forest model is overfitting, but our KNN model is not. We will use a grid search.

In [None]:
from sklearn.model_selection import GridSearchCV
# NOTE(review): the names rfc / CV_rfc are reused from the random-forest grid
# search, but here they hold a k-NN classifier and its grid search.
rfc=KNeighborsClassifier()

# Hyper-parameter values explored for k-NN
param_grid = { 
    'leaf_size':[30,50],
    'p':[2,3]
}
# 5-fold cross-validated search over the grid above
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)
# The search itself is left disabled
#CV_rfc.fit(X_train, Y_train)

In [None]:
#CV_rfc.best_params_

In [None]:
#After
hyperparametres = {'leaf_size': 50, 'p': 2,'n_neighbors':3}
algorithme=KNeighborsClassifier(**hyperparametres)
score      = get_score(algorithme, X_train, X_test, Y_train, Y_test)
performances[algorithme] = score
print(score)

### Feature importance with Decision tree

In [None]:
algorithme=DecisionTreeClassifier(criterion='entropy')
score      = get_score(algorithme, X_train, X_test, Y_train, Y_test)
performances[algorithme] = score
print(score)

In [None]:
y_pred = algorithme.predict(X_test)

In [None]:
# Pair every feature name with its importance and rank them, most important first
ranked = sorted(
    zip(X_train.columns, algorithme.feature_importances_),
    key=lambda pair: pair[1],
    reverse=True,
)
feature_imp = pd.DataFrame(ranked)
feature_imp.columns = ['Feature', 'Importance']

# Bar chart of the ranked importances
f, ax = plt.subplots(figsize=(10, 7))
# ax.set(yscale="log")
plt.xticks(rotation=45)
sns.barplot(data=feature_imp, x='Feature', y='Importance')