# Model Selection

- Split data

- Select models

- Train models

In [22]:
import numpy as np
import pandas as pd

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings('always')



#models to run
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, RidgeCV, Lasso, LassoCV
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import PLSRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn import preprocessing
from sklearn import neighbors
from sklearn.preprocessing import LabelBinarizer, SplineTransformer, PolynomialFeatures

#train_test_split
from sklearn.model_selection import train_test_split,StratifiedShuffleSplit, GridSearchCV

#for cycle
from itertools import cycle

#metrics
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report, precision_score, recall_score, f1_score, r2_score, accuracy_score, roc_auc_score
from sklearn.model_selection import cross_val_score, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, cohen_kappa_score, confusion_matrix

RANDOM_STATE = 42
TEST_SIZE = 0.2


In [23]:
URL = "../notebooks/features_selected.csv"
games_original = pd.read_csv(URL, encoding='UTF-8')
games = games_original

In [24]:
game_owners = games["owners"]
games.drop(["owners", "name", "appid", "release_date"], axis=1, inplace=True)
games.dropna()

Unnamed: 0,english,required_age,average_playtime,median_playtime,price,dev-,dev- (Miwashiba),dev-4AM Games,dev-7 Soft Pillows,dev-@CarlosGameDev,...,tag-Story Rich,tag-Survival,tag-Trains,tag-Turn-Based Strategy,tag-Twin Stick Shooter,tag-Utilities,tag-VR,tag-Violent,tag-Warhammer 40K,tag-Zombies
0,1,0,23944,801,0.00,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,1744,3386,24.99,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,18,614,519,19.99,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,222,360,1.99,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,889,889,9.99,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8581,1,0,278,288,0.79,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
8582,1,0,0,0,2.69,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8583,1,0,0,0,6.19,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8584,1,0,0,0,14.99,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
X_train, X_test, y_train, y_test = train_test_split(games, game_owners, test_size=TEST_SIZE, random_state=RANDOM_STATE)


In [26]:
def model_fit(model, X_train, y_train):
    """Fit classification model using sklearn library. Returns: model predictions and probabilities"""
    model.fit(X_train, y_train)

def model_predict(model, X_test):
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)
    return y_pred, y_pred_proba

def get_performance_metrics(y_test, model_predictions, model_predictions_probability):
    

    """Calculate accuracy, precision, recall, f1-score, and kappa score. Returns: Dictionary of parameters""" 
    model_accuracy = sum(y_test == model_predictions) / len(y_test)
    model_precision = precision_score(y_test,model_predictions, average='weighted')
    model_recall = recall_score(y_test, model_predictions, average='weighted')
    model_f1 = f1_score(y_test,model_predictions, average='weighted')
    model_kappa = cohen_kappa_score(y_test,model_predictions)

    # Confusion matrix
    model_confusion_matrix = confusion_matrix(y_test,model_predictions)

    # Return as dictionary
    return {'Model_Accuracy': model_accuracy, 'Model_Precision': model_precision, 'Model_Recall': model_recall, 'Model_F1_Score': model_f1, \
         'Model_Kappa': model_kappa, 'Confusion_Matrix': model_confusion_matrix}

In [27]:
X_train.head()

Unnamed: 0,english,required_age,average_playtime,median_playtime,price,dev-,dev- (Miwashiba),dev-4AM Games,dev-7 Soft Pillows,dev-@CarlosGameDev,...,tag-Story Rich,tag-Survival,tag-Trains,tag-Turn-Based Strategy,tag-Twin Stick Shooter,tag-Utilities,tag-VR,tag-Violent,tag-Warhammer 40K,tag-Zombies
5121,1,16,1112,1416,15.99,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
58,1,0,220,263,7.99,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2301,1,0,0,0,14.99,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6808,1,0,0,0,5.59,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5113,1,0,0,0,3.99,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [28]:
#a)
fit_lda = LinearDiscriminantAnalysis()
model_fit(fit_lda, X_train, y_train)
lda_pred, lda_pred_proba = model_predict(fit_lda, X_test)

#lda_eval = get_performance_metrics(y_test, lda_pred, lda_pred_proba)


accuracy = accuracy_score(y_test, lda_pred)

print(accuracy)

0.420256111757858


In [29]:
#b) 
'''
fit_qda = QuadraticDiscriminantAnalysis()
qda_pred, qda_pred_proba = model_fit(fit_qda, X_train, y_train, X_test)


accuracy = accuracy_score(y_test, qda_pred)

print(accuracy)
''' 

'\nfit_qda = QuadraticDiscriminantAnalysis()\nqda_pred, qda_pred_proba = model_fit(fit_qda, X_train, y_train, X_test)\n\n\naccuracy = accuracy_score(y_test, qda_pred)\n\nprint(accuracy)\n'

In [30]:
#c)
fit_logit = LogisticRegression(random_state=42)
model_fit(fit_logit, X_train, y_train)
logit_pred, logit_pred_proba = model_predict(fit_logit, X_test)


accuracy = accuracy_score(y_test, logit_pred)

print(accuracy)

0.4918509895227008


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [31]:
fit_gnb = GaussianNB()
model_fit(fit_gnb, X_train, y_train)
gnb_pred, gnb_pred_proba = model_predict(fit_logit, X_test)

accuracy = accuracy_score(y_test, gnb_pred)
print(accuracy)

0.4918509895227008


In [32]:
#e)
fit_knn = neighbors.KNeighborsClassifier()
model_fit(fit_knn, X_train, y_train)
knn_pred, knn_pred_proba = model_predict(fit_knn, X_test)

accuracy = accuracy_score(y_test, knn_pred)

print(accuracy)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


0.490104772991851


In [33]:
#f) 
fit_dt = DecisionTreeClassifier()
model_fit(fit_dt, X_train, y_train)
dt_pred, dt_pred_proba = model_predict(fit_dt, X_test)

accuracy = accuracy_score(y_test, dt_pred)
print(accuracy)

TypeError: model_fit() takes 3 positional arguments but 4 were given

In [None]:
class RandomForest:
    def __init__(self, X, y, random_state=RANDOM_STATE):
        """Initializes the RandomForest class with the input features X and target variable y,
        and the random state used for reproducibility.

        Parameters:
        -----------
        X : pandas.DataFrame
            The input feature matrix of shape (n_samples, n_features).
        y : pandas.Series
            The target variable of shape (n_samples,).
        random_state : int, default=RANDOM_STATE, which is 42
            The seed value for random number generator used to split the data.

        Returns:
        --------
        None"""

        self.X = X
        self.y = y
        self.random_state = random_state
        self.model = RandomForestClassifier()
        self.best_params = {} 
        self.best_score = 0 
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.X, self.y, test_size=0.2, random_state=random_state)
    
    def fit(self, tune_fit="yes"):
        """Trains the random forest model using the input data X and y.
        If tune_fit is set to "yes", it tunes the hyperparameters using GridSearchCV(), otherwise, with default parameters.
        It then stores the best hyperparameters and best estimator in the attributes best_params and model, respectively.

        Parameters:
        -----------
        tune_fit : str, default="yes"
            If "yes", tune the hyperparameters using GridSearchCV(), otherwise use default parameters.

        Returns:
        --------
        None
        """

        if tune_fit=="yes":
            param_grid = {'max_depth': [2, 3, 5, 7, 8, 9, 10], 'max_leaf_nodes': [5, 10, 15, 25, 30, 35, 40, 50], 'min_samples_split': [2], 'min_samples_leaf': [2,3]}
            grid_search = GridSearchCV(estimator=self.model, param_grid=param_grid, refit=True, cv=3)
            grid_search.fit(self.X_train, self.y_train)
            self.best_params = grid_search.best_params_
            self.model = grid_search.best_estimator_
        else: 
            self.model.fit(self.X_train, self.y_train)
            
    def predict(self):
        """Predicts the target variable of the test data using the trained random forest model.
        Returns the predicted target variable values.

        Parameters:
        -----------
        None

        Returns:
        --------
        numpy.ndarray: The predicted target variable values
        
        """

        y_pred = self.model.predict(self.X_test)
        return y_pred
    
    def score(self):
        """Computes the accuracy score and classification report for the predicted target variable and the actual test target variable.

        Parameters:
        -----------
        None

        Returns:
        --------
        accuracy score: The ratio of the correctly predicted observations to the total observations.
        
        classification report: A text report of the main classification metrics such as precision, recall, f1-score and support for each class."""
        
        y_pred = self.predict()
        accuracy = accuracy_score(self.y_test, y_pred)
        report = classification_report(self.y_test, y_pred)

        return accuracy, report

In [None]:
fit_rf = RandomForest(X_train, y_train, RANDOM_STATE)
fit_rf.fit("yes")
pred_rf = fit_rf.predict()
accuracy_rf, classification_report_rf = fit_rf.score()
print("The accuracy achieved by random forest model:", accuracy_rf)



The accuracy achieved by random forest model: 0.5473071324599709


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
class KNN:
    def __init__(self, X, y, n_neighbors=5, test_size=0.2, random_state=42):
        """
        Initialize the KNN classifier.

        Parameters:
        -----------
            X (pandas.DataFrame): The feature matrix of shape (n_samples, n_features).
            y (pandas.Series): The target vector of shape (n_samples,).
            n_neighbors (int, optional): The number of nearest neighbors to use in classification. Defaults to 5.
            test_size (float, optional): The proportion of samples to use for testing. Defaults to 0.2.
            random_state (int, optional): The random state to use for splitting the data. Defaults to 42.
        """
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
        self.model = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors)
        self.best_params = {}
        self.best_score = 0
        
    def fit(self, tune_fit="no"):
        """
        Fit the KNN model to the training data.

        Parameters:
        -----------
            tune_fit (str, optional): Whether to perform hyperparameter tuning. If "yes", performs a grid search to find
                the best hyperparameters. Defaults to "no".

        Raises:
        -----------
            ValueError: If `tune_fit` is not "yes" or "no".

        Returns:
        -----------
            None
        """
        if tune_fit=="yes": 
            self.model.fit(self.X_train, self.y_train)

            param_grid = {
                'n_neighbors': [3, 5, 7, 10, 20, 40, 50, 70], 'metric': ['minkowski','euclidean','manhattan']
            }
            grid_search = GridSearchCV(estimator=self.model, param_grid=param_grid, refit=True, cv=3)
            grid_search.fit(self.X_train, self.y_train)

            self.best_params = grid_search.best_params_
            self.model = grid_search.best_estimator_
            self.best_score = grid_search.best_score_

            self.model.fit(self.X_train, self.y_train)
        elif tune_fit=="no":
            self.model.fit(self.X_train, self.y_train)
        else:
            raise ValueError("Invalid value for `tune_fit`. Must be either 'yes' or 'no'.")
        
    def predict(self):
        """
        Predict the target values for the test data.

        Parameters:
        -----------
        None

        Returns:
        -----------
            numpy.ndarray: The predicted target values of shape (n_samples,)
        """
        y_pred = self.model.predict(self.X_test)
        return y_pred
    
    def score(self):
        """
        Calculate the accuracy score and classification report for the KNN model.

        Parameters:
        -----------
        None

        Returns:
        -----------
        accuracy score: The ratio of the correctly predicted observations to the total observations.
        
        classification report: A text report of the main classification metrics such as precision, recall, f1-score and support for each class.
        """
        
        y_pred = self.predict()
        accuracy = accuracy_score(self.y_test, y_pred)
        #model_precision = precision_score(self.y_test, y_pred, pos_label='thriller', average='binary')
        #model_recall = recall_score(self.y_test, y_pred, pos_label='thriller', average='binary')
        #model_f1 = f1_score(self.y_test, y_pred, pos_label='thriller', average='binary')
        #report = "Accuracy score: {a} \n Precision score: {p} \n Recall score: {r} \n f1 score: {f1_score}"\
        #.format(a=accuracy ,p =model_precision, r= model_recall, f1_score= model_f1)
        report = classification_report(self.y_test, y_pred)

        return accuracy, report

In [None]:
fit_knn = KNN(X_train, y_train, 5, 0.2, RANDOM_STATE)
fit_knn.fit("yes")
pred_knn = fit_knn.predict()
accuracy_knn, classification_report_knn = fit_knn.score()
print("The accuracy achieved by knn model:", accuracy_knn) 

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mo

KeyboardInterrupt: 