In [None]:

import numpy as np
import pandas as pd 


# Read the student-performance CSV into `df` and preview the first rows.
DATA_PATH = '../input/xAPI-Edu-Data.csv'
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# (rows, columns) of the loaded frame
frame_shape = df.shape
print(frame_shape)

In [None]:
# Count of missing values in each column
missing_per_column = df.isnull().sum(axis=0)
missing_per_column

**Data Visualization and Exploration**


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Number of students in each Topic
sns.countplot(data=df, x="Topic", palette="muted")
plt.show()

In [None]:
# Binary indicator: 1 when the student's Class is 'L', 0 otherwise.
df['Failed'] = (df['Class'] == 'L').astype(int)

# Mean of `Failed` per Topic as a point plot. `factorplot` was renamed to
# `catplot` and its `size` argument to `height` in newer seaborn releases.
sns.catplot(x='Topic', y='Failed', data=df, kind='point', height=9)

In [None]:
# Contingency table: rows are Class, columns are Topic
pd.crosstab(index=df['Class'], columns=df['Topic'])

It appears that no one failed Geology while students in IT, Chemistry, and Math had the highest probability of failing. 

In [None]:
# Number of students in each Class
sns.countplot(data=df, x='Class', palette='PuBu')
plt.show()

In [None]:
# Frequency of each Class label
df['Class'].value_counts()

In [None]:
# Parent satisfaction counts, split by the student's Class
sns.countplot(
    data=df,
    x='ParentschoolSatisfaction',
    hue='Class',
    palette='bright',
)
plt.show()

In [None]:
# Mean of `Failed` for each value of `Relation`. `factorplot` is the old
# name of `catplot`; a point plot is requested explicitly.
sns.catplot(x='Relation', y='Failed', data=df, kind='point')

Among the students who did well, a large majority had parents who were satisfied with the education their children received. The students whose parents were least satisfied with the school performed much worse. The students whose mothers were responsible for them had a higher chance of performing well.

In [None]:
# Mean of `Failed` for each gender. `factorplot` is the old name of
# `catplot`; a point plot is requested explicitly.
sns.catplot(x="gender", y="Failed", data=df, kind='point')

In [None]:
# Box plot of raised hands per Class, with the individual observations
# overlaid as a swarm on the same axes.
Raised_hand = sns.boxplot(data=df, x="Class", y="raisedhands")
sns.swarmplot(data=df, x="Class", y="raisedhands", color=".15", ax=Raised_hand)
plt.show()

In [None]:
# Density of `raisedhands`, one curve per value of `Failed`.
# `height` / `fill` replace the older `size` / `shade` argument names.
Facetgrid = sns.FacetGrid(df, hue='Failed', height=6)
Facetgrid.map(sns.kdeplot, 'raisedhands', fill=True)
# Clip the x-axis to the observed range, starting at zero.
Facetgrid.set(xlim=(0, df['raisedhands'].max()))
Facetgrid.add_legend()


The boxplot analysis indicates that those who did well were more active in class, and the worst performers were the least active.

In [None]:
# Box plot of Discussion per Class, with the individual observations
# overlaid as a swarm on the same axes.
ax = sns.boxplot(data=df, x="Class", y="Discussion")
sns.swarmplot(data=df, x="Class", y="Discussion", color=".25", ax=ax)
plt.show()

In [None]:
# Density of `Discussion`, one curve per value of `Failed`.
# `height` / `fill` replace the older `size` / `shade` argument names.
Facetgrid = sns.FacetGrid(df, hue='Failed', height=7)
Facetgrid.map(sns.kdeplot, 'Discussion', fill=True)
# Clip the x-axis to the observed range, starting at zero.
Facetgrid.set(xlim=(0, df['Discussion'].max()))
# Without a legend the two hue curves cannot be told apart.
Facetgrid.add_legend()
plt.show()

In [None]:
# Box plot of VisITedResources per Class, with the individual observations
# overlaid as a swarm on the same axes.
Vis_res = sns.boxplot(data=df, x="Class", y="VisITedResources")
sns.swarmplot(data=df, x="Class", y="VisITedResources", color=".25", ax=Vis_res)
plt.show()

In [None]:
# Density of `VisITedResources`, one curve per value of `Failed`.
# `height` / `fill` replace the older `size` / `shade` argument names.
Facetgrid = sns.FacetGrid(df, hue='Failed', height=7)
Facetgrid.map(sns.kdeplot, 'VisITedResources', fill=True)
# Clip the x-axis to the observed range, starting at zero.
Facetgrid.set(xlim=(0, df['VisITedResources'].max()))
# Without a legend the two hue curves cannot be told apart.
Facetgrid.add_legend()
plt.show()

In [None]:
# Box plot of AnnouncementsView per Class, with the individual observations
# overlaid as a swarm on the same axes.
Anounce_bp = sns.boxplot(data=df, x="Class", y="AnnouncementsView")
sns.swarmplot(data=df, x="Class", y="AnnouncementsView", color=".25", ax=Anounce_bp)
plt.show()

In [None]:
# Density of `AnnouncementsView`, one curve per value of `Failed`.
# `height` / `fill` replace the older `size` / `shade` argument names.
Facetgrid = sns.FacetGrid(df, hue='Failed', height=7)
Facetgrid.map(sns.kdeplot, 'AnnouncementsView', fill=True)
# Clip the x-axis to the observed range, starting at zero.
Facetgrid.set(xlim=(0, df['AnnouncementsView'].max()))
# Without a legend the two hue curves cannot be told apart.
Facetgrid.add_legend()
plt.show()

It is clear that the lowest performers rarely visited the course resources. The swarmplot shapes also indicate that the highest and lowest performers had the most consistent habits with respect to viewing the course resources. It also appears that fewer students from all groups viewed course announcements, but there is still a clear relationship between viewing course announcements and how well the student performed.

Now that we have seen the students' academic behavior by marks, let's go back and see if this pattern is consistent in the Geology class.

In [None]:
# Per-Topic median of the numeric columns. `numeric_only=True` keeps the
# string-valued columns out of the aggregation, where a median is undefined.
df.groupby('Topic').median(numeric_only=True)


In [None]:
# 0 when StudentAbsenceDays is 'Under-7', 1 for any other value.
df['AbsBoolean'] = (df['StudentAbsenceDays'] != 'Under-7').astype(int)

# Share of students per Topic whose absence flag is 1
df.groupby('Topic')['AbsBoolean'].mean()

The Geology students seemed to participate more frequently than those in other subjects and attended class more than those in any other subject, which could explain why none of the Geology students failed.

In [None]:
# Summary statistics for the columns at positions 9-12, the same column
# slice the classification cells below use as features. `df[9:13]` would
# slice *rows* 9-12 instead, describing only four students.
df.iloc[:, 9:13].describe()

**Classification**
First, the perceptron and SVC will be trained on the continuous data.

In [None]:
# Encode the performance class as an ordinal target (0 = low, 1 = middle,
# 2 = high). Both spellings are accepted: the `Failed` flag built earlier
# tests for the short code 'L', while this cell originally looked for the
# long 'Low-Level' style names, so one of the two could never match.
CLASS_TO_ORDINAL = {
    'L': 0.0, 'Low-Level': 0.0,
    'M': 1.0, 'Middle-Level': 1.0,
    'H': 2.0, 'High-Level': 2.0,
}
df['TotalQ'] = df['Class'].map(CLASS_TO_ORDINAL)

# Fail loudly rather than hand string or NaN labels to the classifiers.
if df['TotalQ'].isnull().any():
    raise ValueError(
        'Unrecognised Class labels: %r'
        % sorted(df.loc[df['TotalQ'].isnull(), 'Class'].astype(str).unique())
    )

# Feature columns are taken by position (columns 9-12); presumably the four
# activity counters plotted above -- verify against the CSV header.
# `.copy()` makes this an independent frame so later column additions do
# not write through to (or warn about) `df`.
continuous_subset = df.iloc[:, 9:13].copy()

X = np.array(continuous_subset).astype('float64')
y = np.array(df['TotalQ'])
X.shape

In [None]:
# `train_test_split` lives in `sklearn.model_selection` (already used for
# `cross_val_score` further down); `sklearn.cross_validation` was removed.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold out 30% of the rows for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)

# Learn the scaling parameters from the training rows only, then apply the
# same transformation to both splits.
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

In [None]:
from sklearn.linear_model import Perceptron

# `n_iter` was replaced by `max_iter`; `tol=None` disables early stopping so
# the model still makes exactly 40 passes over the training data.
ppn = Perceptron(max_iter=40, tol=None, eta0=0.1, random_state=0)
ppn.fit(X_train_std, y_train)

y_pred = ppn.predict(X_test_std)
print('Misclassified samples: %d' % (y_test != y_pred).sum())

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of test rows whose predicted label matches the true label
acc = accuracy_score(y_test, y_pred)
print('Accuracy: %.2f' % acc)

In [None]:
from sklearn.metrics import classification_report

# Text report of per-class metrics for the current predictions
report = classification_report(y_test, y_pred)
print(report)

**Linear SVC**

In [None]:
from sklearn.svm import SVC

# Linear-kernel SVC trained on the standardized training split
svm = SVC(kernel='linear', C=2.0, random_state=0).fit(X_train_std, y_train)

y_pred = svm.predict(X_test_std)
n_wrong = (y_test != y_pred).sum()
print('Misclassified samples: %d' % n_wrong)

In [None]:

# Accuracy of the most recent predictions on the test split
acc = accuracy_score(y_test, y_pred)
print('Accuracy: %.2f' % acc)

In [None]:

# Per-class metrics for the most recent predictions
report = classification_report(y_test, y_pred)
print(report)

**Non-linear SVC with rbf**

In [None]:
# RBF-kernel SVC trained on the standardized training split
svm = SVC(kernel='rbf', random_state=0, gamma=2, C=1.0)
y_pred = svm.fit(X_train_std, y_train).predict(X_test_std)
n_wrong = (y_test != y_pred).sum()
print('Misclassified samples: %d' % n_wrong)

In [None]:
# Accuracy of the most recent predictions on the test split
acc = accuracy_score(y_test, y_pred)
print('Accuracy: %.2f' % acc)

The rbf SVC model performed the best on the dataset. One possible contributor to this could be that no outliers were removed.


Taking a step back, let's look at some of the categorical data. 

In [None]:
# Absence-category counts, split by the student's Class
sns.countplot(
    data=df,
    x='StudentAbsenceDays',
    hue='Class',
    palette='bright',
)
plt.show()

In [None]:
# Mean of `Failed` for each absence category. `factorplot` is the old name
# of `catplot`; a point plot is requested explicitly.
sns.catplot(x='StudentAbsenceDays', y='Failed', data=df, kind='point')

The biggest visual trend can be seen in how frequently the student was absent. Over 90% of the students who did poorly were absent more than seven times, while almost none of the students who did well were absent more than seven times.

We will create a dummy variable for this category, and include it in our model. 

Although parent satisfaction showed a huge pattern with respect to how well a student did in the class, there is no information on whether or not the survey was taken after grades were posted, and furthermore the attribute does not give any information about the student's classroom behavior so it was left out. 

In [None]:

# Add the absence indicator as an extra feature. `assign` returns a new
# frame instead of writing a column into a frame that was sliced out of
# `df`, which avoids pandas' SettingWithCopy warning.
continuous_subset = continuous_subset.assign(Absences=df['AbsBoolean'])

X = np.array(continuous_subset).astype('float64')
y = np.array(df['TotalQ'])

# Same 70/30 split and seed as before, now on the extended feature matrix.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)

# Re-fit the scaler on the new training rows and apply it to both splits.
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

In [None]:
# Re-train the existing `svm` estimator on the current training split and
# score it on the matching test split.
y_pred = svm.fit(X_train_std, y_train).predict(X_test_std)
n_wrong = (y_test != y_pred).sum()
print('Misclassified samples: %d' % n_wrong)

In [None]:
# Per-class metrics for the most recent predictions
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# Rows matching this exact combination of activity counts
outlier_mask = (
    (df['raisedhands'] == 2)
    & (df['VisITedResources'] == 9)
    & (df['AnnouncementsView'] == 7)
)
df.loc[outlier_mask]

In the previous version of the dataset, there was an observation with 'Total' value of '76' that is highlighted above. This was removed during exploration, and resulted in the Perceptron achieving a 57% accuracy, indicating that the outlier is influencing the accuracy of the linear classification methods. 

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
#clf = MLPClassifier(solver='lbfgs',alpha=1e-5,random_state=1)

In [None]:
# Fit a fresh scaler on the full feature matrix (`fit` returns the scaler).
sc = StandardScaler().fit(X)

In [None]:
from sklearn.pipeline import make_pipeline

# Standardize inside the estimator: the MLP previously trained on the raw,
# unscaled features. Wrapping scaler + MLP in one pipeline also means each
# cross-validation fold fits its scaler on that fold's training part only.
clf = make_pipeline(
    StandardScaler(),
    MLPClassifier(solver='lbfgs', alpha=.1, random_state=1),
)
clf.fit(X, y)

# 10-fold cross-validated scores for the scaled MLP
scores = cross_val_score(clf, X, y, cv=10)

In [None]:
# Mean cross-validation score with a two-standard-deviation band
mean_acc = scores.mean()
spread = scores.std() * 2
print('Accuracy: %0.2f (+/- %0.2f)' % (mean_acc, spread))

Lastly, I will reproduce the original results I got during my first run-through without the outlier to demonstrate its effect on the Perceptron.

In [None]:
# Remove the outlier row by its index *label*. The original positional form
# (`df.index[[126]]`) removes a different row every time the cell is
# re-run; dropping by label with errors='ignore' makes the cell idempotent.
# NOTE(review): assumes `df` still carries the default 0..n-1 index from
# `read_csv`, so label 126 is the row that was at position 126.
OUTLIER_INDEX = 126
df = df.drop(labels=[OUTLIER_INDEX], errors='ignore')
df.shape

In [None]:
# Re-run the lookup for this combination of activity counts
outlier_mask = (
    (df['raisedhands'] == 2)
    & (df['VisITedResources'] == 9)
    & (df['AnnouncementsView'] == 7)
)
df.loc[outlier_mask]

In [None]:
from sklearn.linear_model import Perceptron

# Rebuild features, split and scaling from the *current* `df`. The existing
# X_train_std / y_train were computed before the row was dropped (and with
# the extra `Absences` column), so re-using them would train on data that
# still contains the outlier.
# NOTE(review): uses the same positional columns 9-12 as the first
# Perceptron run, which is what this cell is meant to reproduce -- confirm.
X = np.array(df.iloc[:, 9:13]).astype('float64')
y = np.array(df['TotalQ'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)

sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

# `n_iter` was replaced by `max_iter`; `tol=None` disables early stopping so
# the model still makes exactly 40 passes over the training data.
ppn = Perceptron(max_iter=40, tol=None, eta0=0.1, random_state=0)
ppn.fit(X_train_std, y_train)

y_pred = ppn.predict(X_test_std)
print('Misclassified samples: %d' % (y_test != y_pred).sum())

In [None]:
# Accuracy of the most recent predictions on the test split
acc = accuracy_score(y_test, y_pred)
print('Accuracy: %.2f' % acc)