In [1]:
import os, re
import numpy as np
import pandas as pd
import nltk

from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = stopwords.words('english')

from nltk.stem import WordNetLemmatizer 
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer() 

mypath = "./Case Presentation 1"
files = os.listdir(mypath)
from nltk.stem import PorterStemmer

def txt_to_df(files):
    df = pd.DataFrame(columns=["content", "label"])
    for f in files:
        txtFile = open('./Case Presentation 1/'+f, 'r')
        tempTxt = txtFile.read().splitlines()
        flag = False
        content = ""
        
        for line in tempTxt:
            if line[-1:] == ":" : 
                flag = True
                continue
            if flag:
                content += (" " + line)
            
            content = re.sub(r'[^A-Za-z\s]',r' ',content)
            # Lowercase
            content = " ".join([w.lower() for w in content.split()])
            # Remove Stop
            content = " ".join([w for w in content.split() if w not in stop_words])
            # Stemming
            #st = PorterStemmer()
            #content = " ".join([st.stem(w) for w in content.split()])
            # 
            content = " ".join([lemmatizer.lemmatize(w, pos='v') for w in content.split()])
            #content = " ".join([lemmatizer.lemmatize(w, pos='n') for w in content.split()])
                
        if f[:3] == 'CUR':
            label = 'smoker'
        elif f[:3] == 'PAS':
            label = 'past_smoker'
        elif f[:3] == 'NON':
            label = 'non_smoker'
        else:
            label = 'unknown'
            
        df = df.append({
            "content": content,
            "label": label,

        }, ignore_index=True)
    return df

files = os.listdir("./Case Presentation 1/")
df = txt_to_df(files)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\wendy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\wendy\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
df_x = pd.DataFrame(df.iloc[:,0])
df_y = np.array(df.iloc[:,1])

In [3]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(df_x, df_y, test_size=8, stratify=df_y)

In [4]:
# need survey how does TF-IDF work, not yet done
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

corpus_train = list(X_train["content"])
vectorizer = CountVectorizer(stop_words=stop_words, analyzer='word', max_features=500)
X = vectorizer.fit_transform(corpus_train) 

transformer = TfidfTransformer(smooth_idf=True)
tfidf = transformer.fit_transform(X)
r = pd.DataFrame(tfidf.toarray(),columns=vectorizer.get_feature_names())

corpus_val = list(X_val["content"])
X_val = vectorizer.transform(corpus_val) 
tfidf_val = transformer.transform(X_val)
r_val = pd.DataFrame(tfidf_val.toarray(),columns=vectorizer.get_feature_names())


## SVM

In [49]:
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV

svc = svm.SVC()

grid = GridSearchCV(estimator=svc, 
                    param_grid={'C': [1, 5, 10, 50, 100], 
                                'kernel': ('linear', 'rbf', 'poly', 'sigmoid')}, cv=4, scoring='accuracy', verbose=1)

grid_result = grid.fit(r, y_train)
grid_result.best_params_

Fitting 4 folds for each of 20 candidates, totalling 80 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:    0.2s finished


{'C': 1, 'kernel': 'poly'}

In [6]:
from sklearn.metrics import accuracy_score

predict = grid_result.best_estimator_.predict(r_val)
accuracy_score(predict, y_val)

0.625

## Decision Tree

In [22]:
from sklearn import tree

tree = tree.DecisionTreeClassifier

grid = GridSearchCV(estimator=tree(),
             param_grid={'criterion':['gini','entropy'],
                         'splitter':['best', 'random'],
                         'max_depth':[1,5,10,15,20,25,30]}, cv=4, scoring='accuracy', verbose=1)

grid_result = grid.fit(r, y_train)
grid_result.best_params_

Fitting 4 folds for each of 28 candidates, totalling 112 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 112 out of 112 | elapsed:    0.4s finished


{'criterion': 'gini', 'max_depth': 15, 'splitter': 'random'}

In [23]:
predict = grid_result.best_estimator_.predict(r_val)
accuracy_score(predict, y_val)

0.75

## Random Forest

In [57]:
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier

grid = GridSearchCV(estimator=forest(),
             param_grid={'n_estimators':[10,20,50,100],
                         'max_depth':[1,5,10,15,20,25,30],
                         'criterion':['gini', 'entropy']}, cv=4, scoring='accuracy', verbose=1)

grid_result = grid.fit(r, y_train)
grid_result.best_params_

Fitting 4 folds for each of 56 candidates, totalling 224 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 224 out of 224 | elapsed:   12.6s finished


{'criterion': 'gini', 'max_depth': 10, 'n_estimators': 100}

In [58]:
predict = grid_result.best_estimator_.predict(r_val)
accuracy_score(predict, y_val)

0.875

## Logistic Regression

In [53]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()

grid = GridSearchCV(estimator=lr,
             param_grid={'penalty':['l1', 'l2'],
                         'C':np.logspace(-3,3,7),
                         'solver':['liblinear']},cv=4, scoring='accuracy', verbose=1)

grid_result = grid.fit(r, y_train)
grid_result.best_params_

Fitting 4 folds for each of 14 candidates, totalling 56 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  56 out of  56 | elapsed:    0.1s finished


{'C': 1000.0, 'penalty': 'l1', 'solver': 'liblinear'}

In [56]:
predict = grid_result.best_estimator_.predict(r_val)
accuracy_score(predict, y_val)

0.5