# 1. Importing libraries

In [None]:
import re
import os
import time
import string
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud 

In [None]:
from sklearn.manifold import TSNE
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_curve, auc
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
import xgboost as xgb
from sklearn.svm import SVC
from sklearn.utils import resample
from sklearn import neural_network
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_curve, auc
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.cross_validation import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

In [None]:
import nltk
import gensim  
from nltk.util import ngrams
from nltk.corpus import stopwords
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, TreebankWordTokenizer
from nltk import SnowballStemmer, PorterStemmer, LancasterStemmer

# 2. Import, inspect and clean training data

In [None]:
data = pd.read_csv('../input/train/train.csv')

In [None]:
data['type'].value_counts().plot(kind = 'bar')
plt.show()

In [None]:
## read the data
mbti = data

## split the rows: each user row holds many posts separated by '|||';
## emit one [type, post] record per individual post
all_mbti = pd.DataFrame(
    [
        [mbti_type, single_post]
        for mbti_type, joined_posts in zip(mbti['type'], mbti['posts'])
        for single_post in joined_posts.split('|||')
    ],
    columns=['type', 'post'],
)

## remove urls: every matched URL is replaced by the placeholder token 'urlweb'
pattern_url = r'http[s]?://(?:[A-Za-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9A-Fa-f][0-9A-Fa-f]))+'
subs_url = r'urlweb'
masked_posts = all_mbti['post'].replace(to_replace = pattern_url, value = subs_url, regex = True)

# make lower case
all_mbti['post'] = masked_posts.str.lower()

#remove punctuation
import string
def remove_punctuation_numbers(post):
    """Return ``post`` with all ASCII punctuation characters and digits removed.

    Every other character, including whitespace, is kept in its original order.
    """
    unwanted = string.punctuation + '0123456789'
    kept = []
    for char in post:
        if char in unwanted:
            continue
        kept.append(char)
    return ''.join(kept)
all_mbti['post'] = all_mbti['post'].apply(remove_punctuation_numbers)

In [None]:
all_mbti.head()

In [None]:
all_mbti.describe()

In [None]:
all_mbti['post_length'] = all_mbti['post'].apply(len)

In [None]:
all_mbti.head()

In [None]:
all_mbti.describe()

In [None]:
mu, sigma = 138, 65
# the histogram of the data
n, bins, patches = plt.hist(all_mbti['post_length'],  facecolor='green', alpha=0.75)
plt.xlabel('Post Character Length')
plt.title(r'$\mathrm{Distribution\ of\ Post\ Length:}\ \mu=138,\ \sigma=65$')
plt.axis([0, 200, 0, 160000])
plt.ylabel('Count')
plt.grid(True)
plt.show()

In [None]:
all_mbti.head()

In [None]:
all_mbti.describe()

In [None]:
all_mbti['post'][all_mbti['post_length'] == 200]

In [None]:
# Row labels of posts that were identified as noise during inspection.
noise_labels = [90524, 178121, 196595, 270750, 97314]

# Resolve every label to its text BEFORE filtering anything: filtering one post
# at a time can remove a row (e.g. one with identical text) whose label is looked
# up by a later step, which would then fail.
noise_posts = all_mbti.loc[noise_labels, 'post']

is_noise = all_mbti['post'].isin(noise_posts)
is_bare_url = all_mbti['post'] == 'urlweb'          # post consisted only of a URL
is_single_char = all_mbti['post_length'] == 1       # one-character posts carry no signal

all_mbti = all_mbti[~(is_noise | is_bare_url | is_single_char)]

In [None]:
all_mbti.head()

# 3. Natural Language Processing

In [None]:
tokeniser = TreebankWordTokenizer()
all_mbti['tokens'] = all_mbti['post'].apply(tokeniser.tokenize)

In [None]:
all_mbti.head()

In [None]:
# find the stem of each word in words
stemmer = SnowballStemmer('english')

def mbti_stemmer(words, stemmer):
    """Return a list with the stem of every word in ``words``.

    ``stemmer`` is any object exposing a ``stem(word)`` method; the output
    keeps the order of the input.
    """
    stems = []
    for token in words:
        stems.append(stemmer.stem(token))
    return stems

# Apply stemmer
all_mbti['stem'] = all_mbti['tokens'].apply(mbti_stemmer, args=(stemmer, ))

In [None]:
all_mbti.drop('tokens', axis=1, inplace=True)
all_mbti.head()

In [None]:
lemmatizer = WordNetLemmatizer()
def mbti_lemma(words, lemmatizer):
    """Return a list with every word in ``words`` lemmatized as a verb.

    ``lemmatizer`` is any object exposing ``lemmatize(word, pos=...)``; it is
    always called with ``pos='v'``.
    """
    lemmas = []
    for token in words:
        lemmas.append(lemmatizer.lemmatize(token, pos='v'))
    return lemmas

# Apply lemmatizer
all_mbti['lemma'] = all_mbti['stem'].apply(mbti_lemma, args=(lemmatizer, ))

In [None]:
all_mbti.drop('stem', axis=1, inplace=True)
all_mbti.head()

In [None]:
# Update stopwords with class names and other stop words without punctuation
stopset = set(stopwords.words('english'))
stopset.update(['istj', 'isfj', 'infj', 'intj', 'istp', 'isfp', 'infp', 'intp', 
                 'estp', 'esfp', 'enfp', 'entp', 'estj', 'esfj', 'enfj', 'entj', 
                'dtype', 'infps', 'theres', 'xxxjs', 'im', 'shes', 'arent', 'youve', 
                'youre', 'dont', 'urlweb'])


def remove_stop_words(tokens):
    """Return the tokens that are not members of the module-level ``stopset``.

    Order and duplicates of the surviving tokens are preserved.
    """
    kept = []
    for token in tokens:
        if token in stopset:
            continue
        kept.append(token)
    return kept

all_mbti['stem_stop'] = all_mbti['lemma'].apply(remove_stop_words)

In [None]:
all_mbti.drop('lemma', axis=1, inplace=True)
all_mbti.head()

In [None]:
def join_string(tokens):
    """Join ``tokens`` into one space-separated string, converting each with ``str``."""
    as_text = map(str, tokens)
    return " ".join(as_text)

all_mbti['join_stop'] = all_mbti['stem_stop'].apply(join_string)

In [None]:
all_mbti.drop('stem_stop', axis=1, inplace=True)
all_mbti.head()

In [None]:
plt.figure(figsize=(20,10))
# str(Series) is pandas' truncated display repr (a few head/tail rows plus the
# "Name: ..., dtype: ..." footer), not the corpus. Join the actual posts so the
# cloud reflects every processed post.
corpus_text = ' '.join(all_mbti['join_stop'])
wordcloud = WordCloud(background_color='white', mode = "RGB", width = 2000, height=1000).generate(corpus_text)
plt.title("MBTI_TRAIN")
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

# 4. Build Classification Models

## 4.1 Defining Features and Labels

In [None]:
# One binary target per MBTI axis: the column is 1 when the letter at the given
# position of `type` equals the first letter of the column name, otherwise 0.
for axis_column, letter_position, positive_letter in (
    ('I/E', 0, 'I'),
    ('N/S', 1, 'N'),
    ('F/T', 2, 'F'),
    ('P/J', 3, 'P'),
):
    all_mbti[axis_column] = all_mbti['type'].apply(
        lambda x, pos=letter_position, letter=positive_letter: x[pos] == letter
    ).astype('int')

In [None]:
all_mbti.head()

In [None]:
y_i = all_mbti['I/E']
y_n = all_mbti['N/S']
y_f = all_mbti['F/T']
y_p = all_mbti['P/J']

In [None]:
vect = CountVectorizer(min_df= .01)
X = vect.fit_transform(all_mbti['join_stop'])

In [None]:
type(X)

In [None]:
X.shape

### 4.2.1 Introvert - Extrovert

In [None]:
all_mbti['I/E'].value_counts().plot(kind = 'bar')
plt.title('Extrovert - Introvert')
plt.show()

In [None]:
n = 5000
X_train_i, X_test_i, y_train_i, y_test_i = train_test_split(X[:n].toarray(), y_i[:n])

In [None]:
# if we assume that everyone is introverted would give us this accuracy
y_i_mean = np.mean(y_train_i)
y_i_mean

 ### 4.2.2 iNtuition - Sensing

In [None]:
all_mbti['N/S'].value_counts().plot(kind = 'bar')
plt.title(' Sensing - Intuition')
plt.show()

In [None]:
n = 5000
# The N/S hold-out labels get their own name (y_test_n); unpacking them into
# y_test_i would overwrite the I/E hold-out labels and leave y_test_n undefined.
X_train_n, X_test_n, y_train_n, y_test_n = train_test_split(X[:n].toarray(), y_n[:n])

In [None]:
# if we assume that everyone is Intuitive give us this accuracy
y_n_mean = np.mean(y_train_n)
y_n_mean

> ### 4.2.3 Feeling - Thinking

In [None]:
all_mbti['F/T'].value_counts().plot(kind = 'bar')
plt.title('Thinking - Feeling')
plt.show()

In [None]:
n = 5000
X_train_f, X_test_f, y_train_f, y_test_f = train_test_split(X[:n].toarray(), y_f[:n])

In [None]:
# if we assume that everyone is a 'Feeler' would give us this accuracy
y_f_mean = np.mean(y_train_f)
y_f_mean

### 4.2.4  Perception - Judgement

In [None]:
all_mbti['P/J'].value_counts().plot(kind = 'bar')
plt.title(' Judgement - Perception')
plt.show()

In [None]:
n = 5000
X_train_p, X_test_p, y_train_p, y_test_p = train_test_split(X[:n].toarray(), y_p[:n])

In [None]:
# if we assume that everyone is perceptive would give us this accuracy
y_p_mean = np.mean(y_train_p)
y_p_mean

## 4.3  Dealing with Imbalanced Data 

In [None]:
ros = RandomOverSampler(ratio='auto', random_state=42)

In [None]:
print ('Introvert vs Extrovert Class Mean: {}'.format(y_i_mean))
print ('iNtuition vs Sensing Class Mean: {}'.format(y_n_mean))
print ('Feeling vs Thinking Class Mean: {}'.format(y_f_mean))
print ('Perception vs Judgement Class Mean: {}'.format(y_p_mean))

In [None]:
# Balancing the Introvert vs Extrovert Class
X_res_i, y_res_i = ros.fit_sample(X, y_i)

In [None]:
# Class balance of the oversampled I/E labels. Stored under its own name so the
# P/J baseline already held in y_p_mean is not overwritten.
y_res_i_mean = np.mean(y_res_i)
y_res_i_mean

In [None]:
X_res_i.shape

In [None]:
y_res_i.shape

In [None]:
n = 5000
X_train_i, X_test_i, y_train_i, y_test_i = train_test_split(X_res_i[:n].toarray(), y_res_i[:n])

In [None]:
# Balancing the iNtuition vs Sensing Class 
X_res_n, y_res_n = ros.fit_sample(X, y_n)

In [None]:
X_res_n.shape

In [None]:
# Inspect the resampled labels (the previous cell already showed X_res_n.shape).
y_res_n.shape

In [None]:
n = 5000
X_train_n, X_test_n, y_train_n, y_test_n = train_test_split(X_res_n[:n].toarray(), y_res_n[:n])

In [None]:
# Balancing the Feeling vs Thinking Class
X_res_f, y_res_f = ros.fit_sample(X, y_f)

In [None]:
X_res_f.shape

In [None]:
y_res_f.shape

In [None]:
# Balancing the perception vs Judgement Class
X_res_p, y_res_p = ros.fit_sample(X, y_p)

In [None]:
X_res_p.shape

In [None]:
y_res_p.shape

## 4.4 Import Model

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib.colors import ListedColormap

# to automate the NLP extraction...
from sklearn.feature_extraction.text import CountVectorizer

# Cross_val_score is the new class for today...
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn import metrics

from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification

# main ones to focus on for this sprint
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# Covered in sprint 3
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

# Covered in sprint 4
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

# Neural Network!!
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb

In [None]:
names = ['xgb','Logistic Regression', 'Nearest Neighbors',
         'Linear SVM', 'RBF SVM',
         'Naive Bayes', 'GBC', 'LDA', "QDA",
         "Decision Tree", "Random Forest",  "AdaBoost",
         "Neural Net"]

In [None]:
classifiers = [
    xgb.XGBClassifier(),
    LogisticRegression(),
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(kernel = 'rbf', gamma=1, C=1),
    GaussianNB(),
    GradientBoostingClassifier(n_estimators=20),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis(),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    MLPClassifier(hidden_layer_sizes=(400,), alpha=1, activation='relu', solver='adam',)
]

> ### 4.4.1 Model Fitting and Training

In [None]:
results = []

models = {}
confusion = {}
class_report = {}


for name, clf in zip(names, classifiers):
    print ('Fitting {:s} model...'.format(name))
    run_time = %timeit -q -o clf.fit(X_train_n, y_train_n)

    print ('... predicting')
    y_pred = clf.predict(X_test_n)
    y_pred_test = clf.predict(X_test_n)

    print ('... scoring')
    accuracy  = metrics.accuracy_score(y_test_n, y_pred)
    precision = metrics.precision_score(y_test_n, y_pred)
    recall    = metrics.recall_score(y_test_n, y_pred)

    f1        = metrics.f1_score(y_test_n, y_pred)
    f1_test   = metrics.f1_score(y_test_n, y_pred_test)

    # save the results to dictionaries
    models[name] = clf
    confusion[name] = metrics.confusion_matrix(y_test_n, y_pred)
    class_report[name] = metrics.classification_report(y_test_n, y_pred)

    results.append([name, accuracy, precision, recall, f1, f1_test, run_time.best])


results = pd.DataFrame(results, columns=['Classifier', 'Accuracy', 'Precision', 'Recall', 'F1 Train', 'F1 Test', 'Train Time'])
results.set_index('Classifier', inplace= True)

In [None]:
results.sort_values('F1 Train', ascending=False)

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(10, 5))
results.sort_values('F1 Train', ascending=False, inplace=True)
# On a vertical bar chart the score lives on the y axis; the x axis holds one
# slot per classifier, so a [0, 1.1] limit there would hide most of the bars.
results.plot(y=['F1 Test'], kind='bar', ax=ax[0], ylim=[0,1.1])
results.plot(y='Train Time', kind='bar', ax=ax[1])

> ## 4.5 Model Validation and Evaluation

In [None]:

def plot_confusion_matrix(cm, names, title='Confusion matrix', cmap=plt.cm.Blues):
    """
    Draw a confusion matrix as a colour-mapped image on the current figure.

    cm    -- the confusion matrix to display
    names -- class names, used as tick labels on both axes
    title -- text shown above the image
    cmap  -- matplotlib colour map used for the cells
    """
    # One tick per class, shared by both axes.
    class_positions = np.arange(len(names))

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    plt.xticks(class_positions, names, rotation=45)
    plt.yticks(class_positions, names)
    # tight_layout runs before the axis labels are attached.
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
def plot_roc(pred,y):
    """
    Draw a ROC curve on a new figure.

    pred -- the predictions (passed to roc_curve as the scores)
    y    -- the expected output (passed to roc_curve as the true labels)
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(y, pred)
    area_under_curve = auc(false_pos_rate, true_pos_rate)
    curve_label = 'ROC curve (area = %0.2f)' % area_under_curve

    plt.figure()
    plt.plot(false_pos_rate, true_pos_rate, label=curve_label)
    # Dashed diagonal from (0, 0) to (1, 1) for reference.
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC)')
    plt.legend(loc="lower right")
   

###  From the results above, I have the 5 best-performing models.
### I am going to combine all of them and use hard voting, whereby predictions are made based on what the majority of the models predict.

In [None]:
group_essay = pd.read_csv('../input/group-essay/Essay_data.csv')

In [None]:
# NOTE(review): `mbti_test` is only assigned further down in this notebook (from
# `test_data`), so this cell raises NameError when the cells run top to bottom.
# Confirm whether it was meant to operate on `group_essay` loaded just above.
mbti_test['posts'] = mbti_test['posts'].str.lower()

In [None]:
m = VotingClassifier(
    estimators=[('sv', SVC(kernel='rbf', C=1, gamma=1)), 
                ('mp', MLPClassifier(hidden_layer_sizes=(400,), alpha=1, activation='relu', solver='adam',)),
                ('xg', xgb.XGBClassifier(max_depth=3,n_estimators=50, learning_rate=1 )), 
                ('rf', RandomForestClassifier(n_estimators=300,   oob_score=False, warm_start=True)),
                ('gb', GradientBoostingClassifier(n_estimators=20))], 
    voting='hard')

# 5 Model Evaluation 

> ## 5.1 Introvert vs Extrovert

In [None]:
n = 5000
X_train_i, X_test_i, y_train_i, y_test_i = train_test_split(X_res_i[:n].toarray(), y_res_i[:n])

In [None]:
m.fit(X_train_i, y_train_i)

In [None]:
pred_i = m.predict(X_test_i)

In [None]:
print (confusion_matrix(y_test_i, pred_i))
cm = (confusion_matrix(y_test_i, pred_i))
names = ['Introvert','Extrovert']

plot_confusion_matrix(cm,names )

In [None]:
# plot_roc's signature is (pred, y): predictions first, expected labels second.
plot_roc(pred_i, y_test_i)

> ## 5.2 Intuition vs Sensing

In [None]:
n = 5000
X_train_n, X_test_n, y_train_n, y_test_n = train_test_split(X_res_n[:n].toarray(), y_res_n[:n])

In [None]:
m.fit(X_train_n, y_train_n)

In [None]:
pred_n = m.predict(X_test_n)

In [None]:
print (confusion_matrix(y_test_n, pred_n))
cm = (confusion_matrix(y_test_n, pred_n))
names = ['Intuitive','Sense']

plot_confusion_matrix(cm,names )

In [None]:
# plot_roc's signature is (pred, y): predictions first, expected labels second.
plot_roc(pred_n, y_test_n)

> ## 5.3 Feeling vs Thinking

In [None]:
n = 5000
X_train_f, X_test_f, y_train_f, y_test_f = train_test_split(X_res_f[:n].toarray(), y_res_f[:n])

In [None]:
m.fit(X_train_f, y_train_f)

In [None]:
pred_f = m.predict(X_test_f)

In [None]:
print (confusion_matrix(y_test_f, pred_f))
cm = (confusion_matrix(y_test_f, pred_f))
names = ['Feel','Think']

plot_confusion_matrix(cm,names )

In [None]:
# plot_roc's signature is (pred, y): predictions first, expected labels second.
plot_roc(pred_f, y_test_f)

> ## 5.4 Perception vs Judging

In [None]:
n = 5000
X_train_p, X_test_p, y_train_p, y_test_p = train_test_split(X_res_p[:n].toarray(), y_res_p[:n])

In [None]:
m.fit(X_train_p, y_train_p)

In [None]:
pred_p = m.predict(X_test_p)

In [None]:

print (confusion_matrix(y_test_p, pred_p))
cm = (confusion_matrix(y_test_p, pred_p))
names = ['Percieve','Judge']

plot_confusion_matrix(cm,names )

In [None]:
# plot_roc's signature is (pred, y): predictions first, expected labels second.
plot_roc(pred_p, y_test_p)

# 6 Test Data

> ## 6.1 Test Data Normalization

In [None]:
test_data = pd.read_csv('../input/group-essay/Essay_data.csv')

In [None]:
# NOTE(review): DataFrame.dropna() returns a new frame and the result is not
# assigned here, so `test_data` itself still contains any rows with missing values.
test_data.dropna()
test_data.head()

In [None]:
test_data['posts'] = test_data['posts'].apply(lambda x: x.replace('|||', ' '))

In [None]:
# Bare attribute access only echoes the bound method in the notebook output;
# it does not rename any column (that needs a call with a mapping).
test_data.rename

In [None]:
test = test_data.dropna()

In [None]:

#test_data['posts'] = test_data['posts'].apply(lambda x: x.replace('|||', ' '))
## read the data
mbti_test = test_data

## remove urls
# pattern_url = r'http[s]?://(?:[A-Za-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9A-Fa-f][0-9A-Fa-f]))+'
# subs_url = r'urlweb'
# mbti_test['posts'] = mbti_test['posts'].replace(to_replace = pattern_url, value = subs_url, regex = True)

# make lower case
mbti_test['Essay'] = mbti_test['Essay'].str.lower()

#remove punctuation
import string
def remove_punctuation_numbers(post):
    """Strip ASCII punctuation and the digits 0-9 from ``post``.

    All remaining characters (letters, whitespace, ...) are returned in order
    as a single string.
    """
    drop_chars = string.punctuation + '0123456789'
    survivors = (char for char in post if char not in drop_chars)
    return ''.join(survivors)
mbti_test['Essay'] = mbti_test['Essay'].apply(remove_punctuation_numbers)

In [None]:
mbti_test.head()

In [None]:
mbti_test['post_length'] = mbti_test['Essay'].apply(len)

In [None]:
mbti_test.head()

In [None]:
tokeniser = TreebankWordTokenizer()
test_data['tokens'] = test_data['Essay'].apply(tokeniser.tokenize)

In [None]:
mbti_test.drop('type', axis=1, inplace=True)

In [None]:
mbti_test.head()

In [None]:
# find the stem of each word in words
stemmer = SnowballStemmer('english')

def mbti_stemmer(words, stemmer):
    """Stem each word in ``words`` with ``stemmer.stem`` and return the stems as a list."""
    stemmed = []
    for token in words:
        stemmed.append(stemmer.stem(token))
    return stemmed

# stem all words in the mbti dataframe
mbti_test['stem'] = mbti_test['tokens'].apply(mbti_stemmer, args=(stemmer, ))

In [None]:
mbti_test.drop('tokens', axis=1, inplace=True)
mbti_test.head()

In [None]:
lemmatizer = WordNetLemmatizer()
def mbti_lemma(words, lemmatizer):
    """Lemmatize each word in ``words`` as a verb (``pos='v'``) and return the results as a list."""
    lemmatized = []
    for token in words:
        lemmatized.append(lemmatizer.lemmatize(token, pos='v'))
    return lemmatized

# lemmatize all words in dataframe
mbti_test['lemma'] = mbti_test['stem'].apply(mbti_lemma, args=(lemmatizer, ))

In [None]:
mbti_test.drop('stem', axis=1, inplace=True)
mbti_test.head()

In [None]:
def remove_stop_words(tokens):
    """Return the tokens that do not appear in the module-level ``stopset``, in order."""
    is_content_word = lambda token: token not in stopset
    return list(filter(is_content_word, tokens))

mbti_test['stem_stop'] = mbti_test['lemma'].apply(remove_stop_words)

In [None]:
mbti_test.drop('lemma', axis=1, inplace=True)
mbti_test.head()

In [None]:
def join_string(tokens):
    """Convert every token to ``str`` and join them with single spaces."""
    pieces = [str(token) for token in tokens]
    return " ".join(pieces)

mbti_test['join_stop'] = mbti_test['stem_stop'].apply(join_string)

In [None]:
mbti_test.drop('stem_stop', axis=1, inplace=True)
mbti_test.head()

In [None]:
mbti_test.describe()

In [None]:
mu, sigma = 138, 65
# the histogram of the data
n, bins, patches = plt.hist(mbti_test['post_length'], facecolor='green', alpha=0.75)
plt.xlabel('Post Character Length')
plt.title(r'$\mathrm{Distribution\ of\ Post\ Length:}\ \mu=6666,\ \sigma=1566$')
plt.axis([0, 9582, 0, 600])
plt.ylabel('Count')
plt.grid(True)
plt.show()

In [None]:
plt.figure(figsize=(20,10))
# str(Series) is pandas' truncated display repr (a few head/tail rows plus the
# "Name: ..., dtype: ..." footer), not the corpus. Join the actual essays so the
# cloud reflects every processed row.
corpus_text = ' '.join(mbti_test['join_stop'])
wordcloud = WordCloud(background_color='white', mode = "RGB", width = 2000, height=1000).generate(corpus_text)
plt.title("MBTI_TEST_DATA")
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

In [None]:
#vect = CountVectorizer(min_df= .01)
X_test_data = vect.transform(mbti_test['join_stop'])

In [None]:
X_test_data.shape

> ## 6.2 Test Predictions

> ## 6.2.1 Introvert Vs Extrovert

In [None]:
# fit() returns the estimator itself, so `m.fit(...)` would make model_i just
# another name for `m` and any later `m.fit` would silently replace this model.
# Fit an independent, unfitted copy of the voting ensemble instead.
from sklearn.base import clone

model_i = clone(m).fit(X_res_i[:30000], y_res_i[:30000])

In [None]:
pred_i = model_i.predict(X_test_data)

In [None]:
len(pred_i)

In [None]:
pred_i[:10]

In [None]:
# The I/E target was built as (type[0] == 'I'), i.e. 1 means Introvert,
# so a predicted 1 maps to 'I' and anything else to 'E'.
res_i = ['I' if label == 1 else 'E' for label in pred_i]

In [None]:
res_i[:10]

> ## 6.2.2 Intuitive Vs Sensing

In [None]:
# fit() returns the estimator itself, so `m.fit(...)` would make model_n just
# another name for `m` and any later `m.fit` would silently replace this model.
# Fit an independent, unfitted copy of the voting ensemble instead.
from sklearn.base import clone

model_n = clone(m).fit(X_res_n[:30000], y_res_n[:30000])

In [None]:
pred_n = model_n.predict(X_test_data)

In [None]:
len(pred_n)

In [None]:
# Translate the binary N/S predictions into letters: 1 -> 'N', otherwise 'S'.
res_n = []
for i in pred_n:
    res_n.append('N' if i==1 else 'S')

> ## 6.2.3 Feel Vs Thinking

In [None]:
# fit() returns the estimator itself, so `m.fit(...)` would make model_f just
# another name for `m` and any later `m.fit` would silently replace this model.
# Fit an independent, unfitted copy of the voting ensemble instead.
from sklearn.base import clone

model_f = clone(m).fit(X_res_f[:30000], y_res_f[:30000])

In [None]:
pred_f = model_f.predict(X_test_data)

In [None]:
len(pred_f)

In [None]:
# The F/T target was built as (type[2] == 'F'), i.e. 1 means Feeling,
# so a predicted 1 maps to 'F' and anything else to 'T'.
res_f = ['F' if label == 1 else 'T' for label in pred_f]

> ## 6.2.4 Perceive Vs Judge

In [None]:
# fit() returns the estimator itself, so `m.fit(...)` would make model_p just
# another name for `m`, shared with every other model fitted through `m`.
# Fit an independent, unfitted copy of the voting ensemble instead.
from sklearn.base import clone

model_p = clone(m).fit(X_res_p[:30000], y_res_p[:30000])

In [None]:
pred_p = model_p.predict(X_test_data)

In [None]:
len(pred_p)

In [None]:
# The P/J target was built as (type[3] == 'P'), i.e. 1 means Perceiving,
# so a predicted 1 maps to 'P' and anything else to 'J'.
res_p = ['P' if label == 1 else 'J' for label in pred_p]

In [None]:
res_i = np.array(res_i)
res_n = np.array(res_n)
res_f = np.array(res_f)
res_p = np.array(res_p)

In [None]:
# Turn each prediction vector into a single column. -1 lets numpy infer the row
# count, so this keeps working if the number of test rows changes.
res_i = res_i.reshape(-1, 1)
res_n = res_n.reshape(-1, 1)
res_f = res_f.reshape(-1, 1)
res_p = res_p.reshape(-1, 1)

In [None]:
results = np.concatenate((res_i, res_n, res_f, res_p), axis=1 )


In [None]:
results = pd.DataFrame(results)

In [None]:
results.head()

In [None]:
results.describe()

In [None]:
results.to_csv('Tshepo_Moagi.csv', index=False)

# 7. Extras

* Additional pre-processing performed on this data was removing long words that turned out to be noise. I updated the stopwords with words that were missed in the first round.
* Plotting a `word cloud` lets you see the most frequent words, check whether the stop words were removed, and see the effects of lemmatizing and stemming. It is an essential NLP visual aid.
* I used the models from the training section and, in addition, `XGBoost`, and bundled the best 5 models into a `voting classifier`, whereby each model makes a prediction but the final prediction is the one that the majority of the models predicted.
* Instead of printing out the confusion matrix I also plotted it in a heatmap as a visual aid.
* In order for a model to be better than a random guesser, it has to have an `auc` greater than 0.5, so alongside the confusion matrix, I plotted an `roc` chart.
* I used an upsampling method to balance the data.

In [None]:
df_test = pd.read_csv('../input/group-essay/Essay_data.csv')

In [None]:
df_test.head()

In [None]:

# make lower case
df_test['Essay'] = df_test['Essay'].str.lower()

In [None]:
df_test = df_test.dropna()

In [None]:
import string
def remove_punctuation_numbers(post):
    """Return ``post`` without ASCII punctuation characters or digits.

    Characters are examined one by one and the kept ones are concatenated in
    their original order.
    """
    banned = string.punctuation + '0123456789'
    cleaned = ''
    for char in post:
        if char not in banned:
            cleaned = ''.join([cleaned, char])
    return cleaned
df_test['Essay'] = df_test['Essay'].apply(remove_punctuation_numbers)

In [None]:
df_test.head()

In [None]:
plt.figure(figsize=(20,10))
# str(Series) is pandas' truncated display repr (a few head/tail rows plus the
# "Name: ..., dtype: ..." footer), not the corpus. Join the actual essays so the
# cloud reflects every row.
corpus_text = ' '.join(df_test['Essay'])
wordcloud = WordCloud(background_color='white', mode = "RGB", width = 2000, height=1000).generate(corpus_text)
plt.title("Academy Group Essay")
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

In [None]:
y_i = df_test['I/E']
y_n = df_test['N/S']
y_f = df_test['T/F']
y_p = df_test['J/P']

In [None]:
X_Group = vect.transform(df_test['Essay'])

# Introvert Group

In [None]:
pred_i = model_i.predict(X_Group)

In [None]:
len(pred_i)

In [None]:
# Translate the binary I/E predictions into letters: 1 -> 'I', otherwise 'E'.
res_i = []
for i in pred_i:
    res_i.append('I' if i==1 else 'E')

In [None]:
print (confusion_matrix(y_i, res_i))

# Intuitive Group

In [None]:
pred_n = model_n.predict(X_Group)

In [None]:
len(pred_n)

In [None]:
# Translate the binary N/S predictions into letters: 1 -> 'N', otherwise 'S'.
res_n = []
for i in pred_n:
    res_n.append('N' if i==1 else 'S')

In [None]:
print (confusion_matrix(y_n, res_n))

# Feelers Group

In [None]:
pred_f = model_f.predict(X_Group)

In [None]:
pred_f

In [None]:
len(y_f)

In [None]:
len(res_f)

In [None]:
# Translate the binary F/T predictions into letters: 1 -> 'F', otherwise 'T',
# then compare them with the labels read from the essay file.
res_f = []
for i in pred_f:
    res_f.append('F' if i==1 else 'T')
feel_think_cm = confusion_matrix(y_f, res_f)
print (feel_think_cm)

# Perceive Group

In [None]:
pred_p = model_p.predict(X_Group)

In [None]:
# Translate the binary P/J predictions into letters: 1 -> 'P', otherwise 'J',
# then compare them with the labels read from the essay file.
res_p = []
for i in pred_p:
    res_p.append('P' if i==1 else 'J')
perceive_judge_cm = confusion_matrix(y_p, res_p)
print (perceive_judge_cm)