# Initial Modelling notebook

In [1]:
import os
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
import warnings

In [3]:
import bay12_solution_eposts as solution

## Load data

In [4]:
# Load the 'train' split; `load_dfs` returns a (post, thread) pair of dataframes.
post, thread = solution.prepare.load_dfs('train')

In [5]:
# Peek at the first two post rows to see which columns are available.
post.head(2)

Unnamed: 0,thread_num,user,text,quotes
0,45016,Mephansteras,"Basically, this is where we talk about what ga...",[]
1,45016,dakarian,The currently running or about to run games (i...,[]


In [6]:
# Index threads by `thread_num` so features and labels built later align on it.
# Guarded so that re-running this cell is a no-op: after the first run
# `thread_num` is the index (no longer a column) and an unguarded
# `set_index('thread_num')` would raise a KeyError.
if thread.index.name != 'thread_num':
    thread = thread.set_index('thread_num')
thread.head(2)

Unnamed: 0_level_0,thread_name,thread_label,thread_replies,thread_label_id
thread_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
45016,Games Threshold Discussion and List [Vote for ...,other,5703,8
88720,New Player's Guide to the Subforum - New to Ma...,other,961,8


In [7]:
# Mapping from game-type name (index) to integer type id (values); see the output below.
label_map = solution.prepare.load_label_map()
label_map

type_name
bastard             0
beginners-mafia     1
byor                2
classic             3
closed-setup        4
cybrid              5
kotm                6
non-mafia-game      7
other               8
paranormal          9
supernatural       10
vanilla            11
vengeful           12
Name: type_id, dtype: int64

## Create both labels

In [8]:
# Level-1 label: 0 for the 'other' type id, 1 for every other type id.
other_id = label_map['other']
label_lvl1 = pd.Series(1, index=label_map.values)
label_lvl1.loc[other_id] = 0

# label1 feeds the binary first stage; label2 is simply the full type id.
thread['label1'] = thread['thread_label_id'].map(label_lvl1)
thread['label2'] = thread['thread_label_id']

In [9]:
# Spot-check five random threads (no `random_state` is set, so the rows differ on every run).
thread.sample(5)

Unnamed: 0_level_0,thread_name,thread_label,thread_replies,thread_label_id,label1,label2
thread_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
54229,Vote Mafia: Special Edition - Game Over: Mafia...,closed-setup,253,4,1,4
66479,Beginner's Mafia XVI [Done],beginners-mafia,253,1,1,1
40671,Paranormal Mafia - Round 9 - Game Over,paranormal,769,9,1,9
51708,"The Attendance Ranking Board, Quarter 2, 2010",other,123,8,0,8
44802,BYOR Mafia 2 (Game Over: Town Wins!),byor,742,2,1,2


## Create features from thread dataframe

We will fit a `CountVectorizer`, a simple transformation that counts how many times each vocabulary word occurs in a thread title.

The parameter `min_df` sets the minimum number of thread titles (documents) a word must occur in before it is allowed to join our vocabulary.

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

# Unigram bag-of-words over the thread titles; `min_df=3` drops words that
# appear in fewer than three titles.
cv = CountVectorizer(ngram_range=(1, 1), min_df=3)
word_vectors_raw = cv.fit_transform(thread['thread_name'])

# `get_feature_names()` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out()` (added in 1.0). Prefer the new name and fall back
# to the old one so the cell runs on both old and new releases.
if hasattr(cv, 'get_feature_names_out'):
    vocabulary = list(cv.get_feature_names_out())
else:
    vocabulary = cv.get_feature_names()

# Dense frame of word counts: one row per thread, one column per vocabulary word.
word_df = pd.DataFrame(word_vectors_raw.toarray(), columns=vocabulary, index=thread.index)

In [11]:
# Feature matrix: post count (replies + 1), its natural log, and the word counts.
posts_count = thread['thread_replies'] + 1
X = pd.concat(
    [
        posts_count.rename('posts'),
        np.log(posts_count).rename('log_posts'),
        word_df,
    ],
    axis=1,
)
X.head()

Unnamed: 0_level_0,posts,log_posts,10,12,13,14,15,18,19,alien,...,why,win,wins,winter,with,wizard,world,you,your,zombie
thread_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
45016,5704,8.648923,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
88720,962,6.869014,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
39338,80,4.382027,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
34959,1720,7.45008,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
64229,308,5.7301,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# y: full game-type id (multi-class target); y1: binary 'other' (0) vs. any other type (1).
y = thread['thread_label_id']
y1 = thread['label1']

## Split dataset into "training" and "validation"

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
# NOTE: a fixed `random_state` makes the pseudo-random split reproducible between runs.
validation_pct = 0.25
(
    X_train, X_val,
    y_train, y_val,
    y1_train, y1_val,
) = train_test_split(
    X, y, y1,
    test_size=validation_pct,
    random_state=99,
)

In [15]:
# Stage 1 (binary) sees every thread.
X1_train, X1_val = X_train, X_val

# Stage 2 (game type) trains only on threads whose level-1 label is 1.
idx2 = y1_train.index[y1_train == 1]
X2_train = X_train.reindex(idx2)
y2_train = y_train.reindex(idx2)

# Validation of stage 2 is scored against the full label, 'other' included.
y2_val = y_val

In [16]:
# Sanity check: stage 1 uses all training rows, stage 2 only the label1 == 1 subset.
X_train.shape, y1_train.shape, y2_train.shape

((268, 154), (268,), (122,))

In [17]:
# Sanity check: both validation targets cover every validation row.
X_val.shape, y1_val.shape, y2_val.shape

((90, 154), (90,), (90,))

## Fit models

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

### Fit, in-sample predict

In [19]:
# Stage 1: binary classifier with default hyper-parameters... what can go wrong?
cls1 = LogisticRegression(random_state=1337)
cls1.fit(X1_train, y1_train)

# In-sample predictions, re-indexed by thread number for easy comparison.
y1_train_pred = pd.Series(cls1.predict(X1_train), index=X1_train.index)



In [20]:
# Stage 2: game-type classifier with default hyper-parameters... what can go wrong?
# (Alternative kept for reference: cls2 = LogisticRegression(random_state=1337))
cls2 = DecisionTreeClassifier(random_state=1337)
cls2.fit(X2_train, y2_train)

# In-sample predictions, re-indexed by thread number for easy comparison.
y2_train_pred = pd.Series(cls2.predict(X2_train), index=X2_train.index)

In [21]:
# In-sample accuracy and confusion matrix for the binary stage.
acc1_train = accuracy_score(y1_train, y1_train_pred)
cm1_train = confusion_matrix(y1_train, y1_train_pred)
print("(TRAIN) Job 1 accuracy:", acc1_train)
print(cm1_train)

(TRAIN) Job 1 accuracy: 0.9701492537313433
[[144   2]
 [  6 116]]


In [22]:
# In-sample accuracy and confusion matrix for the game-type stage.
acc2_train = accuracy_score(y2_train, y2_train_pred)
cm2_train = confusion_matrix(y2_train, y2_train_pred)
print("(TRAIN) Job 2 accuracy:", acc2_train)
print(cm2_train)

(TRAIN) Job 2 accuracy: 1.0
[[ 9  0  0  0  0  0  0  0  0  0  0  0]
 [ 0 19  0  0  0  0  0  0  0  0  0  0]
 [ 0  0 11  0  0  0  0  0  0  0  0  0]
 [ 0  0  0 13  0  0  0  0  0  0  0  0]
 [ 0  0  0  0 29  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  2  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  1  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  2  0  0  0  0]
 [ 0  0  0  0  0  0  0  0 17  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  4  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  9  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  6]]


### Validation predict

In [23]:
# Stage-1 predictions on the held-out validation rows.
stage1_val_labels = cls1.predict(X_val)
y1_val_pred = pd.Series(stage1_val_labels, index=X_val.index)

In [24]:
# Default every validation thread to 'other', then let the stage-2 model
# overwrite only the threads that stage 1 predicted as label 1.
stage1_positive = y1_val_pred.eq(1)
y2_val_pred = pd.Series(label_map['other'], index=X_val.index)
y2_val_pred.loc[stage1_positive] = cls2.predict(X_val.loc[stage1_positive])

In [25]:
# Side-by-side preview: column 0 is the stage-1 prediction, column 1 the final type id.
pd.concat([y1_val_pred, y2_val_pred], axis='columns').head()

Unnamed: 0_level_0,0,1
thread_num,Unnamed: 1_level_1,Unnamed: 2_level_1
47778,1,3
47530,1,4
45499,0,8
82626,1,1
86473,0,8


In [26]:
# Held-out accuracy and confusion matrix for the binary stage.
acc1_val = accuracy_score(y1_val, y1_val_pred)
cm1_val = confusion_matrix(y1_val, y1_val_pred)
print("(VALID) Job 1 accuracy:", acc1_val)
print(cm1_val)

(VALID) Job 1 accuracy: 0.8888888888888888
[[51  4]
 [ 6 29]]


In [27]:
# Held-out accuracy and confusion matrix for the combined two-stage prediction.
acc2_val = accuracy_score(y2_val, y2_val_pred)
cm2 = confusion_matrix(y2_val, y2_val_pred)
print("(VALID) Job 2 accuracy:", acc2_val)
print(cm2)

(VALID) Job 2 accuracy: 0.7222222222222222
[[ 0  0  0  2  1  0  0  1  1  0  0  0]
 [ 0  3  0  0  0  0  0  0  1  0  0  0]
 [ 0  0  0  1  0  0  0  0  1  0  0  0]
 [ 1  0  0  2  4  0  0  0  1  0  0  0]
 [ 0  0  0  2  3  0  0  1  1  0  0  0]
 [ 0  0  0  0  0  1  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  1  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  1  0  1  0  0  0  0 51  2  0  0]
 [ 0  0  0  0  0  0  0  0  0  3  0  0]
 [ 0  0  0  0  1  0  0  0  0  0  0  0]
 [ 0  1  0  0  0  0  0  0  0  0  0  2]]


In [28]:
# Per-class precision/recall on the validation set.
# `labels` must be passed by keyword: recent scikit-learn releases accept only
# `y_true` and `y_pred` positionally and raise a TypeError otherwise.
# Listing every type id keeps one report row per name in `label_map`, including
# types absent from the validation split.
print(classification_report(
    y2_val,
    y2_val_pred,
    labels=label_map.values,
    target_names=label_map.index,
))

                 precision    recall  f1-score   support

        bastard       0.00      0.00      0.00         5
beginners-mafia       0.60      0.75      0.67         4
           byor       0.00      0.00      0.00         2
        classic       0.25      0.25      0.25         8
   closed-setup       0.33      0.43      0.38         7
         cybrid       1.00      1.00      1.00         1
           kotm       0.00      0.00      0.00         1
 non-mafia-game       0.00      0.00      0.00         0
          other       0.89      0.93      0.91        55
     paranormal       0.60      1.00      0.75         3
   supernatural       0.00      0.00      0.00         0
        vanilla       0.00      0.00      0.00         1
       vengeful       1.00      0.67      0.80         3

      micro avg       0.72      0.72      0.72        90
      macro avg       0.36      0.39      0.37        90
   weighted avg       0.69      0.72      0.70        90



  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


That's OK :)

# Predict with the model

Here, we will predict on the test set (the predictions to send in), then save the results and the model.

**IMPORTANT NOTE**: In practice, you should re-train the same model on the entire labelled set (training + validation) before predicting on the test set! However, I'm just re-using the model fitted above, as it will be bad anyway. ;)

In [29]:
# Load the 'test' split; same (post, thread) pair as for 'train'.
post_test, thread_test = solution.prepare.load_dfs('test')

In [30]:
# Index test threads by `thread_num`, mirroring the training frame.
# Guarded so that re-running this cell is a no-op: after the first run
# `thread_num` is the index (no longer a column) and an unguarded
# `set_index('thread_num')` would raise a KeyError.
if thread_test.index.name != 'thread_num':
    thread_test = thread_test.set_index('thread_num')
thread_test.head(2)

Unnamed: 0_level_0,thread_name,thread_replies
thread_num,Unnamed: 1_level_1,Unnamed: 2_level_1
126856,"Mafia Tools and Utilities (lurkertracker, etc)...",38
132415,Mafia Theory,211


Use the fitted CountVectorizer and other features to make our X dataframe:

In [31]:
# Use `transform` (not `fit_transform`) so the test titles are encoded with the
# vocabulary already fitted on the training titles.
word_vectors_raw_test = cv.transform(thread_test['thread_name'])

In [32]:
# `get_feature_names()` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out()` (added in 1.0). Prefer the new name and fall back
# to the old one so the cell runs on both old and new releases.
if hasattr(cv, 'get_feature_names_out'):
    vocabulary_test = list(cv.get_feature_names_out())
else:
    vocabulary_test = cv.get_feature_names()

# Dense frame of word counts for the test threads, with the fitted vocabulary as columns.
word_df_test = pd.DataFrame(
    word_vectors_raw_test.toarray(),
    columns=vocabulary_test,
    index=thread_test.index,
)
word_df_test.head()

Unnamed: 0_level_0,10,12,13,14,15,18,19,alien,all,an,...,why,win,wins,winter,with,wizard,world,you,your,zombie
thread_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
126856,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
132415,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
134482,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
133728,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
134270,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
# Same three feature groups as the training matrix: post count, its log, word counts.
posts_count_test = thread_test['thread_replies'] + 1
X_test = pd.concat(
    [
        posts_count_test.rename('posts'),
        np.log(posts_count_test).rename('log_posts'),
        word_df_test,
    ],
    axis=1,
)
X_test.head()

Unnamed: 0_level_0,posts,log_posts,10,12,13,14,15,18,19,alien,...,why,win,wins,winter,with,wizard,world,you,your,zombie
thread_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
126856,39,3.663562,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
132415,212,5.356586,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
134482,475,6.163315,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
133728,564,6.335054,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
134270,11,2.397895,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Now we predict with our model, then paste it to a copy of `thread_test` as column `thread_label_id`.

In [34]:
# Run both stages on every test thread; the two outputs are combined in the next cell.
test_index = X_test.index
y1_test_pred = pd.Series(cls1.predict(X_test), index=test_index)
y2_test_pred = pd.Series(cls2.predict(X_test), index=test_index)

In [35]:
# Keep the stage-2 type where stage 1 predicted non-zero; otherwise force 'other'.
y_test_pred = y2_test_pred.where(y1_test_pred != 0, label_map['other'])

In [36]:
# Preview the final predicted type ids, indexed by thread number.
y_test_pred.head()

thread_num
126856    8
132415    8
134482    8
133728    1
134270    8
dtype: int64

In [37]:
# `assign` returns a new frame, so `thread_test` itself is left untouched.
result = thread_test.assign(thread_label_id=y_test_pred)
result.head()

Unnamed: 0_level_0,thread_name,thread_replies,thread_label_id
thread_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
126856,"Mafia Tools and Utilities (lurkertracker, etc)...",38,8
132415,Mafia Theory,211,8
134482,"Iron Diadem, Night One: Things Said Behind Bar...",474,8
133728,Beginner's Mafia XLIV: The Court of Colors | R...,563,1
134270,Mod Use #2,10,8


We need to reshape to conform to the submission format specified [here](https://www.kaggle.com/c/ni-mafia-gametype#evaluation).

In [38]:
# Submission format: `thread_num` as a regular column next to the predicted id.
submission_columns = ['thread_num', 'thread_label_id']
result = result.reset_index().loc[:, submission_columns]
result.head()

Unnamed: 0,thread_num,thread_label_id
0,126856,8
1,132415,8
2,134482,8
3,133728,1
4,134270,8


# Export predictions, model

Our model consists of the text vectorizer `cv` and classifiers `cls1` and `cls2`. We already formatted our results, we just need to make sure not to write an extra index column.

In [39]:
# NOTE: Exporting next to the notebooks - the files are small, but usually you don't want to do this.
# The folder is resolved against the current working directory;
# `exist_ok=True` makes re-running this cell safe once the folder exists.
out_dir = os.path.abspath('2_output')
os.makedirs(out_dir, exist_ok=True)

In [40]:
# Write the submission file; `index=False` avoids an extra index column.
predictions_path = os.path.join(out_dir, 'anatoly_m2_predict.csv')
result.to_csv(predictions_path, index=False, header=True, encoding='utf-8')

In [41]:
import joblib

# Persist the fitted vectorizer and both classifiers, one file per component.
model_parts = [
    (cv, 'cv.joblib'),
    (cls1, 'cls1.joblib'),
    (cls2, 'cls2.joblib'),
]
for fitted_part, file_name in model_parts:
    joblib.dump(fitted_part, os.path.join(out_dir, file_name))
print("Done. :)")

Done. :)


# Final Remarks

The above submission got 64% on the public test set. And I didn't tune anything yet... :)