In [1]:
import numpy as np
import os

# Seed NumPy's global random number generator with a fixed value so that
# anything drawing from it produces the same numbers on every fresh run.
np.random.seed(23)

%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

import pandas as pd

In [2]:
# Load both CSV files into DataFrames: `train` is the labelled set used for
# fitting and evaluation, `test` is the set predictions are produced for later.
train, test = map(
    pd.read_csv,
    ('Datasets/MNIST/train.csv', 'Datasets/MNIST/test.csv'),
)

In [3]:
from sklearn.model_selection import train_test_split

# Separate the feature columns (everything except 'label') from the target.
x, y = train.drop(columns='label'), train['label']

# Hold out part of the labelled data for evaluation; random_state fixes the shuffle.
# NOTE(review): test_size=0.8 evaluates on 80% of the rows and fits on only 20% --
# confirm this is intended rather than the more usual 0.2.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.8, random_state=42
)

In [4]:
from sklearn.tree import DecisionTreeClassifier

# Baseline: a depth-limited decision tree fit on the raw pixel values,
# scored (mean accuracy) on the held-out split.
classifier = DecisionTreeClassifier(max_depth=5).fit(x_train, y_train)

print(classifier.score(x_test, y_test))

0.6699404761904761


In [6]:
# Binarize the pixel features: every value greater than 0 becomes 1, everything
# else becomes 0. A vectorized comparison replaces the per-element `applymap`
# lambda -- it yields the same 0/1 integers without calling a Python function
# once per cell, and avoids `applymap`, which newer pandas releases deprecate.
# Re-running this cell is safe: binarizing already-binary data is a no-op.
x_train = (x_train > 0).astype(int)
x_test = (x_test > 0).astype(int)

In [7]:
# Refit the same depth-5 decision tree, this time on the binarized features,
# to see how the preprocessing changes held-out accuracy.
classifier = DecisionTreeClassifier(max_depth=5).fit(x_train, y_train)

print(classifier.score(x_test, y_test))

0.6792559523809524


We can see that simplifying the data had little effect on the decision tree. Next, we will try logistic regression, which should reap the benefits of what we've done.

In [10]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression (library default settings) on the binarized
# training features and report its mean accuracy on the held-out split.
model = LogisticRegression().fit(x_train, y_train)

print(model.score(x_test, y_test))



0.8902678571428572


The difference between Logistic Regression and the Decision Tree is quite significant here (about 0.89 versus 0.68 accuracy on the held-out split).

In [11]:
# Apply the same binarization used on the training features to the submission
# data: values greater than 0 become 1, everything else becomes 0.
# The result MUST be assigned back to `test`: the transformation returns a new
# frame and leaves its input untouched, so a bare expression would only display
# the binarized values while the later prediction still ran on raw pixels --
# data the model (fit on 0/1 features) was never trained on.
# Re-running this cell is safe: binarizing already-binary data is a no-op.
test = (test > 0).astype(int)

# Show only the first rows as a sanity check instead of rendering the full frame.
test.head(10)

Unnamed: 0,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# Predict a label for every row of `test` with the fitted logistic regression
# and store the predictions in a one-column frame whose index counts from 1.
# The model was fit on binarized pixels, so `test` needs the same preprocessing
# before this cell runs.
# NOTE(review): if this file is meant for a competition-style submission, confirm
# the expected spelling of the index header ('ImageID' here).
results = pd.DataFrame(
    {'Label': model.predict(test)},
    index=pd.RangeIndex(start=1, stop=len(test) + 1, name='ImageID'),
)

# Write the index column and the 'Label' column, including the header row.
results.to_csv('results.csv', header=True)