In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.metrics as metrics

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV

from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

from sklearn.linear_model import LogisticRegression, Lasso, LassoCV, Ridge, RidgeCV
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA

In [None]:
# Load the two CSV inputs; both paths are relative to the notebook's working directory.
df = pd.read_csv('data/concat_dfs.csv')
# NOTE(review): adj_df is not referenced by any of the cells shown here — confirm it is used later.
adj_df = pd.read_csv('data/adjusted_class.csv')

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
#What do I really want out of this?

#Supervised
#Predict global account survival rates --> needs test data
#Predict character class survival rates --> needs test data, consider ascended vs base?
#Predict character performance --> needs test data

#Unsupervised
#Group accounts by performance (are there accounts that consistently achieve high performance (measured by level) across leagues? Consistently low performance?)
#Group accounts by survival (are there accounts with multiple characters and no deaths?)

In [None]:
# One-hot encode the categorical columns, keeping the name `df` for the cells
# that follow. Only columns that are still present are encoded, so re-running
# this cell after `df` has already been encoded is a no-op instead of a KeyError.
categorical_cols = [col for col in ['Class', 'Account'] if col in df.columns]
if categorical_cols:
    df = pd.get_dummies(df, columns=categorical_cols)

In [None]:
# Keep every column whose name contains 'Class'. Columns containing 'Account'
# are deliberately left out; to include them as well, extend the condition
# with `or 'Account' in col`.
col_names = [col for col in df.columns if 'Class' in col]

# Feature matrix: the selected 'Class' columns plus 'Level'. Target: 'Dead'.
features = col_names + ['Level']
X = df[features]
y = df['Dead']

In [None]:
# Class balance of the target as proportions; the largest share is the accuracy
# a constant majority-class guess would reach.
y.value_counts(normalize=True)

In [None]:
# Fit the scaler on the full feature matrix, then transform that same matrix.
# NOTE(review): the scaler sees every row (no train/test split yet), and
# X_scaled is not what the modelfunc call shown in this notebook receives —
# confirm whether X_scaled is used further down.
ss = StandardScaler().fit(X)
X_scaled = ss.transform(X)

In [None]:
#Thanks to Tori and group project 4
def modelfunc(X, y, random_state=42):
    """Fit a fixed set of baseline classifiers and print their accuracy scores.

    The data is split once into train and test portions. Each pipeline is fit
    on the training portion, and its train accuracy, test accuracy and mean
    cross-validated accuracy (computed on the training portion) are printed.

    Parameters
    ----------
    X : array-like or DataFrame
        Feature matrix.
    y : array-like or Series
        Target labels.
    random_state : int or None, default 42
        Seed for the train/test split and for every estimator here that
        accepts one, so repeated runs print the same numbers. The default
        keeps the split seed the function has always used.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    pipelines = [
        ('LOGISTIC REGRESSION', Pipeline([('LG', LogisticRegression(max_iter=100))])),
        ('DECISION TREE', Pipeline([('TREE', DecisionTreeClassifier(random_state=random_state))])),
        ('BAGGED TREE', Pipeline([('BAG', BaggingClassifier(random_state=random_state))])),
        ('RANDOM FOREST', Pipeline([('RAND', RandomForestClassifier(random_state=random_state))])),
        ('ADABOOST', Pipeline([('ADA', AdaBoostClassifier(random_state=random_state))])),
        # KNN is distance-based, so it is the one pipeline that scales its inputs.
        ('KNN', Pipeline([('sc', StandardScaler()), ('KNN', KNeighborsClassifier())])),
    ]
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)
    for pipe_name, model in pipelines:
        print(pipe_name)
        model.fit(X_train, y_train)
        # A classifier's .score() is its mean accuracy, so a single accuracy
        # computation per split feeds both the "Score" and "Accuracy" lines.
        acc_tr = metrics.accuracy_score(y_train, model.predict(X_train))
        acc_te = metrics.accuracy_score(y_test, model.predict(X_test))
        # Cross-validation only ever sees the training portion.
        crossval = cross_val_score(model, X_train, y_train).mean()
        print (f'Model = {model}')
        print (f'Train Score = {acc_tr}')
        print (f'Test Score = {acc_te}')
        print (f'Cross Val Score = {crossval}')
        print (f'Accuracy Train = {acc_tr}')
        print (f'Accuracy Test = {acc_te}')
        print('')
        print('')

In [None]:
# Evaluate every baseline pipeline on the selected features; scores are printed below.
modelfunc(X, y)

# X = Class, Account, Level

y = Dead

LOGISTIC REGRESSION
Model = Pipeline(steps=[('LG', LogisticRegression())])
Train Score = 0.7574166666666666
Test Score = 0.66825
Cross Val Score = 0.6825000000000001
Accuracy Train = 0.7574166666666666
Accuracy Test = 0.66825


DECISION TREE
Model = Pipeline(steps=[('TREE', DecisionTreeClassifier())])
Train Score = 0.994
Test Score = 0.70675
Cross Val Score = 0.72275
Accuracy Train = 0.994
Accuracy Test = 0.70675


BAGGED TREE
Model = Pipeline(steps=[('BAG', BaggingClassifier())])
Train Score = 0.96525
Test Score = 0.70725
Cross Val Score = 0.7252500000000001
Accuracy Train = 0.96525
Accuracy Test = 0.70725


RANDOM FOREST
Model = Pipeline(steps=[('RAND', RandomForestClassifier())])
Train Score = 0.994
Test Score = 0.70725
Cross Val Score = 0.7255833333333334
Accuracy Train = 0.994
Accuracy Test = 0.70725


ADABOOST
Model = Pipeline(steps=[('ADA', AdaBoostClassifier())])
Train Score = 0.73225
Test Score = 0.712
Cross Val Score = 0.7267499999999999
Accuracy Train = 0.73225
Accuracy Test = 0.712


KNN
Model = Pipeline(steps=[('sc', StandardScaler()), ('KNN', KNeighborsClassifier())])
Train Score = 0.7559166666666667
Test Score = 0.65875
Cross Val Score = 0.6648333333333334
Accuracy Train = 0.7559166666666667
Accuracy Test = 0.65875

# X = Level

LOGISTIC REGRESSION
Model = Pipeline(steps=[('LG', LogisticRegression())])
Train Score = 0.6325
Test Score = 0.62725
Cross Val Score = 0.6325
Accuracy Train = 0.6325
Accuracy Test = 0.62725


DECISION TREE
Model = Pipeline(steps=[('TREE', DecisionTreeClassifier())])
Train Score = 0.7315833333333334
Test Score = 0.7175
Cross Val Score = 0.7279166666666667
Accuracy Train = 0.7315833333333334
Accuracy Test = 0.7175


BAGGED TREE
Model = Pipeline(steps=[('BAG', BaggingClassifier())])
Train Score = 0.73125
Test Score = 0.7155
Cross Val Score = 0.7285833333333334
Accuracy Train = 0.73125
Accuracy Test = 0.7155


RANDOM FOREST
Model = Pipeline(steps=[('RAND', RandomForestClassifier())])
Train Score = 0.7315833333333334
Test Score = 0.7175
Cross Val Score = 0.7283333333333333
Accuracy Train = 0.7315833333333334
Accuracy Test = 0.7175


ADABOOST
Model = Pipeline(steps=[('ADA', AdaBoostClassifier())])
Train Score = 0.7275833333333334
Test Score = 0.7145
Cross Val Score = 0.7260833333333333
Accuracy Train = 0.7275833333333334
Accuracy Test = 0.7145


KNN
Model = Pipeline(steps=[('sc', StandardScaler()), ('KNN', KNeighborsClassifier())])
Train Score = 0.6881666666666667
Test Score = 0.66975
Cross Val Score = 0.6848333333333334
Accuracy Train = 0.6881666666666667
Accuracy Test = 0.66975