### Let's try approaches proposed by dataset creators

Here I will look at solution proposed by a dataset creators, I will reuse their preprocessing except for one thing.
I'm going to use lemmatization instead of stemming

In [1]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer() 

In [2]:
with open('data/enron_train.txt') as f:
    train = f.readlines()
trainRaw = [x.strip() for x in train]

with open('data/enron_test.txt') as f:
    test = f.readlines()
testRaw = [x.strip() for x in test]

In [3]:
def cleanData(train):
    labels = [i.split('\t', 1)[0] for i in train]
    trainData = [i.split('\t', 1)[1] for i in train]

    ## stemming
    from stemming.porter2 import stem
    trainData = [" ".join([lemmatizer.lemmatize(word) for word in sentence.split(" ")]) for sentence in trainData]

    ## replacing http links with $LINK
    import re
    trainData = [re.sub(r"(<?)http:\S+", "$LINK", i) for i in trainData]

    ## replcaing money with $MONEY
    trainData = [re.sub(r"\$\d+", "$MONEY", i) for i in trainData]

    ## replcaing email ids with $EMAILID
    trainData = [re.sub(r'[\w\.-]+@[\w\.-]+', "$EMAILID", i) for i in trainData]

    ## Lowring the words
    trainData = [i.lower() for i in trainData]

    ## removing punctuations
    import regex as regex
    trainData = [regex.sub(r"[^\P{P}$]+", " ", i) for i in trainData]

    ## remove (unnecessary symbols)
    trainData = [re.sub(r"[^0-9A-Za-z/$' ]", " ", i) for i in trainData]
    
    ## replacing Weekdays with $day
    regString = r'monday|tuesday|wednesday|thursday|friday|saturday|sunday'
    trainData = [re.sub(regString, "$days", i) for i in trainData]
    
    ## replacing Months => $month
    regString = r'january|jan|february|feb|march|mar|april|june|july|august|aug|september|sept|october|oct|november|nov|december|dec'
    trainData = [re.sub(regString, "$month", i) for i in trainData]
    
    ## after before during => $time
    regString = r'after|before|during'
    trainData = [re.sub(regString, "$time", i) for i in trainData]
    
    ## replace numbers with $number
    trainData = [re.sub(r'\b\d+\b', "$number", i) for i in trainData]
    
    ## me, her, him ,us or them → $me,
    trainData = [re.sub(r'\b(me|her|him|us|them|you)\b', "$me", i) for i in trainData]
    
    ## striping whitespaces
    trainData = [i.strip() for i in trainData]
    
    return trainData, labels
    

In [4]:
def getTFIDF(data):
    from sklearn.feature_extraction.text import TfidfVectorizer
    vectorizer = TfidfVectorizer(min_df=1)
    X = vectorizer.fit_transform(data)
    return X
    
def getNgrams(txt, n):
    from nltk import ngrams
    ngrams = ngrams(txt.split(), n)
    l = []
    for grams in ngrams:
        l.append('_'.join(map(str,grams)))
    fl = ' '.join(l)
    return fl

## Cleaning train data
trainData, trainLabels = cleanData(trainRaw)

In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from textblob.classifiers import NaiveBayesClassifier as NBC
from sklearn.metrics import confusion_matrix

df = pd.DataFrame({"labels": trainLabels, "trainData": trainData})

train, test = train_test_split(df, test_size = 0.25)


In [11]:
# for easier output
def pred_results(model, x_train, x_test, y_train, y_test):
    
    print(model)
    print("train accuracy", model.score(x_train, y_train))
    print("test accuracy", model.score(x_test, y_test))
    print()
    pred = model.predict(x_test)
    print("Confusion Matrix\n", confusion_matrix(y_test, pred))

In [12]:
trainData, trainLabels = cleanData(trainRaw)
testData, testLabels = cleanData(testRaw)

### Let's see how proposed solution works with the lemmatization

In [13]:
### Adding train and test datasets for SVM as SVM requires 
### same dimentions, number of features for training and test set
data = trainData + testData
labels = trainLabels + testLabels

## getting TFIDF matrix
X = getTFIDF(data)
Y = labels

### Spliting data 80% train - 20% test
from sklearn.cross_validation import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
  X, Y, test_size=0.2, random_state=42
)

## SVM Model
from sklearn.svm import SVC
## Value of C is calculated using grid search
svm = SVC(C=2300, kernel='rbf')
svm.fit(x_train, y_train)

pred_results(svm, x_train, x_test, y_train, y_test)

SVC(C=2300, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
train accuracy 0.8908308685130412
test accuracy 0.7989247311827957

Confusion Matrix
 [[467  76]
 [111 276]]


Here we get a tiny increase of accuracy
```
train accuracy: 0.888 vs 0.890 with lemmatizer
test accuracy: 0.790 vs 0.798 with lemmatizer
```

### Let's try something else

I'm proposing a gradient boosting classifier

In [14]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

In [67]:
gbc = GradientBoostingClassifier(n_estimators=200, subsample=0.8, random_state=13)
gbc.fit(x_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=200,
              presort='auto', random_state=13, subsample=0.8, verbose=0,
              warm_start=False)

In [68]:
pred_results(gbc, x_train, x_test, y_train, y_test)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=200,
              presort='auto', random_state=13, subsample=0.8, verbose=0,
              warm_start=False)
train accuracy 0.8822264049475665
test accuracy 0.7989247311827957

Confusion Matrix
 [[466  77]
 [110 277]]


Works as good as SVC

Now let's do a grid search in order to find the best solution

In [18]:
tuned_parameters = {'n_estimators': [100, 150, 180, 200, 220, 250],
                     'subsample': [0.8, 0.9, 1],
                     'min_samples_split': [2, 3, 4, 6],
                   }

In [43]:
grid_search_gbc = GridSearchCV(GradientBoostingClassifier(random_state=13),
                               tuned_parameters,
                               cv=5, # using 5 folds for cross-validation there
                               scoring='accuracy',
                               n_jobs=4)

In [44]:
grid_search_gbc.fit(X, Y)

GridSearchCV(cv=5, error_score='raise',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=13, subsample=1.0, verbose=0,
              warm_start=False),
       fit_params=None, iid=True, n_jobs=4,
       param_grid={'n_estimators': [100, 150, 180, 200, 220, 250], 'subsample': [0.8, 0.9, 1], 'min_samples_split': [2, 3, 4, 6]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [60]:
grid_search_gbc.best_params_

{'min_samples_split': 3, 'n_estimators': 220, 'subsample': 0.8}

Let's try these params on same split we did with SVC

In [62]:
gbc = GradientBoostingClassifier(n_estimators=220, subsample=0.8, min_samples_split=3, random_state=13)
gbc.fit(x_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=3,
              min_weight_fraction_leaf=0.0, n_estimators=220,
              presort='auto', random_state=13, subsample=0.8, verbose=0,
              warm_start=False)

In [63]:
pred_results(gbc, x_train, x_test, y_train, y_test)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=3,
              min_weight_fraction_leaf=0.0, n_estimators=220,
              presort='auto', random_state=13, subsample=0.8, verbose=0,
              warm_start=False)
train accuracy 0.8865286367303038
test accuracy 0.8032258064516129

Confusion Matrix
 [[468  75]
 [108 279]]


I made it above 80%

### Saving data in csv in order to reuse later

In [64]:
import pandas as pd
from collections import OrderedDict

In [65]:
train_df = pd.DataFrame(OrderedDict(
    {
        "text": trainData,
        "labels": trainLabels
    }
))

test_df = pd.DataFrame(OrderedDict(
    {
        "text": testData,
        "labels": testLabels
    }
))

In [69]:
train_df.to_csv("train.csv", index=False)
test_df.to_csv("test.csv", index=False)