In [1]:
import numpy as np
import pandas as pd
import os
import sklearn
import missingno as msno

# Reading Data

In [2]:
def segmentWords(s): 
    return s.split()

def readFile(fileName):
    # Function for reading file
    # input: filename as string
    # output: contents of file as list containing single words
    contents = []
    f = open(fileName)
    for line in f:
        contents.append(line)
    f.close()
    result = segmentWords('\n'.join(contents))
    return result

#### Create a Dataframe containing the counts of each word in a file

In [3]:
d = []

for c in os.listdir("data_training"):
    directory = "data_training/" + c
    for file in os.listdir(directory):
        words = readFile(directory + "/" + file)
        e = {x:words.count(x) for x in words}
       # e['__FileID__'] = file
        val = -1
        if directory == 'data_training/pos':
            val = 1
        elif directory == 'data_training/neg':
            val = 0
        e['__CLASS__'] = val
        d.append(e)

Create a dataframe from d - make sure to fill all the nan values with zeros.

References:

https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html
https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.fillna.html


In [4]:
df = pd.DataFrame(data=d)
# Just to prove that there are some numbers in the dataset
print(df.shape)
print(sum(pd.isnull(df["they"])))
df = df.replace('NaN', 0)

sm = 0
for index, row in df.iterrows():
    sm += sum(pd.isnull(row))
    
print(sm)
print(df.describe())

(1600, 45672)
447
0
                        earth     goodies          if      ripley  \
count  1600.000000  1600.000000  1600.000000  1600.000000  1600.000000   
mean      0.000625     0.000625     0.000625     0.000625     0.000625   
std       0.025000     0.025000     0.025000     0.025000     0.025000   
min       0.000000     0.000000     0.000000     0.000000     0.000000   
25%       0.000000     0.000000     0.000000     0.000000     0.000000   
50%       0.000000     0.000000     0.000000     0.000000     0.000000   
75%       0.000000     0.000000     0.000000     0.000000     0.000000   
max       1.000000     1.000000     1.000000     1.000000     1.000000   

          suspend        they      white                         \
count  1600.000000  1600.000000  1600.000000  1600.000000  1600.00000   
mean      0.000625     0.000625     0.000625     0.003125     0.00750   
std       0.025000     0.025000     0.025000     0.074958     0.25492   
min       0.000000   

#### Split data into training and validation set 

* Sample 80% of your dataframe to be the training data

* Let the remaining 20% be the validation data (you can filter out the indicies of the original dataframe that weren't selected for the training data)

References:

https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.sample.html
https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.drop.html

In [21]:
def shuffleAndSplit(data):
    length = .8 * data.shape[0]
    indices = np.arange(data.shape[0])
    np.random.shuffle(indices)
    return (data.iloc[indices[:length]], data.iloc[indices[length:]])

#Shuffle original dataframe and split into test
# and validation sets
dfTest, dfVal = shuffleAndSplit(df)
dfTest = pd.DataFrame(data=dfTest)
dfVal = pd.DataFrame(data=dfVal)
print(dfTest.shape, dfVal.shape)

((1280, 45672), (320, 45672))




* Split the dataframe for both training and validation data into x and y dataframes - where y contains the labels and x contains the words

References:

https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.drop.html

In [22]:
#Test data and labels
testData = dfTest.drop('__CLASS__', axis=1)
testLabels = dfTest['__CLASS__']
print(testData.shape, testLabels.shape)
#Validation datasets
valData = dfVal.drop('__CLASS__', axis=1)
valLabels = dfVal['__CLASS__']
print(valData.shape, valLabels.shape)

((1280, 45671), (1280,))
((320, 45671), (320,))


# Logistic Regression

#### Basic Logistic Regression
* Use sklearn's linear_model.LogisticRegression() to create your model.
* Fit the data and labels with your model.
* Score your model with the same data and labels.

References:

http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

In [23]:
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(penalty='l2')
model.fit(testData, testLabels)
print(model.score(valData, valLabels))

0.8625


#### Changing Parameters

In [24]:
#Lets adjust C for more regularization
model = LogisticRegression(penalty='l2', C=3)
model.fit(testData, testLabels)
print(model.score(valData, valLabels))
model = LogisticRegression(penalty='l2', C=8)
model.fit(testData, testLabels)
print(model.score(valData, valLabels))
#We can see that adding a regularization term does not seem to help
# increase accuracy slightly, indicating that the inaccuracy
# was not caused by

0.85625
0.85


#### Feature Selection
* In the backward stepsize selection method, you can remove coefficients and the corresponding x columns, where the coefficient is more than a particular amount away from the mean - you can choose how far from the mean is reasonable.

References:

https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html#
https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.sample.html
https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.drop.html
http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.where.html
https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.std.html
https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.mean.html

In [None]:
weights = model.coef_[0]
total = sum(weights)
print(len(weights))
norm = np.asarray(weights) / total
bad = [x for x in range(len(norm)) if norm[x] < .00000001]
print(len(bad))

f = open('/home/porrster/Data-Science-Decal-Fall-2017/Assignments/Project2_DSDFa17/badcols', 'w')

model = LogisticRegression(penalty='l2', C=8)
model.fit(testData, testLabels)
errBase = model.score(valData, valLabels)
cols = testData.columns
batchsize = int(len(bad) * .01)
for i in range(len(bad) // batchsize):
    testData0 = testData.drop(cols[(i*batchsize):((i+1)*batchsize)], axis=1)
    valData0 = valData.drop(cols[(i*batchsize):((i+1)*batchsize)], axis=1)
    model.fit(testData0, testLabels)
    err = model.score(valData0, valLabels)
    if errBase <= err:
        for count in range(batchsize):
            ind = (i * batchsize) + count
            testData0 = testData.drop(cols[ind], axis=1)
            valData0 = valData.drop(cols[ind], axis=1)
            model.fit(testData0, testLabels)
            err = model.score(valData0, valLabels)
            if errBase <= err:
                testData = testData0
                valData = valData0
    print(i)
for i in range(len(bad) - (len(bad) // batchsize) * 100):
    testData0 = testData.drop(cols[(i*batchsize):((i+1)*batchsize)], axis=1)
    valData0 = valData.drop(cols[(i*batchsize):((i+1)*batchsize)], axis=1)
    model.fit(testData0, testLabels)
    err = model.score(valData0, valLabels)
    if errBase <= err:
        testData = testData0
        valData = valData0
f.write(testdata.columns)
f.close()
print(len(testData))    
  

45671
25036


How did you select which features to remove? Why did that reduce overfitting?

In [14]:
cols = testData.columns
#keepCols = [cols[x] for x in keep]
#testDataClean = testData[keepCols[:]]
#valDataClean = valData[keepCols[:]]

print(testData.shape, testLabels.shape)
model = LogisticRegression(penalty='l2', C=1)
model.fit(testData, testLabels)
print(model.score(valData, valLabels))

((1280, 24503), (1280,))
0.821875


# Single Decision Tree

#### Basic Decision Tree

* Initialize your model as a decision tree with sklearn.
* Fit the data and labels to the model.

References:

http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html


#### Changing Parameters
* To test out which value is optimal for a particular parameter, you can either loop through various values or look into sklearn.model_selection.GridSearchCV

References:


http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html

How did you choose which parameters to change and what value to give to them? Feel free to show a plot.

Why is a single decision tree so prone to overfitting?

# Random Forest Classifier

#### Basic Random Forest

* Use sklearn's ensemble.RandomForestClassifier() to create your model.
* Fit the data and labels with your model.
* Score your model with the same data and labels.

References:

http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html


#### Changing Parameters

What parameters did you choose to change and why?

How does a random forest classifier prevent overfitting better than a single decision tree?