In [1]:
"""
Data loading
"""
import pandas as pd
df = pd.read_csv('./datasets/train.tsv', sep='\t', header=0)

# Removing Id feature because a sequential feature like this is not good.
df = df.drop('Id', 1)


In [None]:
"""
Graphs Plotting
"""

%matplotlib inline
import seaborn as sns
sns.set(color_codes=True)

# Looping on features excluding the 'Label' field
for i, column in enumerate(df.drop('Label', 1)):
    if(df[column].dtypes == "object"):
        sns.plt.figure(i)
        g = sns.countplot(y=column, hue="Label", data=df, palette="Greens_d");
        g.axes.set_title("Histogram for " + column, fontsize=24,alpha=0.5)
    else:
        sns.plt.figure(i)
        g = sns.boxplot(y=column, x="Label", data=df);
        g.axes.set_title("Box plot for " + column, fontsize=24,alpha=0.5)



In [4]:
"""
Data preparation for classification.
"""

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

X = df.drop('Label', 1)
X = pd.get_dummies(X)
features = list(X)

# X_train, X_test, y_train, y_test = train_test_split(X, df.Label, test_size=0.5, random_state=42)




In [24]:
"""
Random Forest Classification.
"""

from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=50)

scores = cross_val_score(rf, X, df['Label'], cv=10)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))


rf.fit_transform(X_train, y_train)
print("Features sorted by their score:")
print(sorted(zip(map(lambda x: round(x, 3), rf.feature_importances_), features), reverse=True))

# print(rf.score(X_test, y_test))

Accuracy: 0.76 (+/- 0.08)
Features sorted by their score:
[(0.101, 'Attribute5'), (0.079, 'Attribute13'), (0.068, 'Attribute2'), (0.05, 'Attribute1_A14'), (0.035, 'Attribute8'), (0.033, 'Attribute1_A11'), (0.033, 'Attribute11'), (0.029, 'Attribute1_A12'), (0.02, 'Attribute4_A40'), (0.018, 'Attribute7_A72'), (0.018, 'Attribute4_A43'), (0.018, 'Attribute3_A34'), (0.017, 'Attribute9_A93'), (0.017, 'Attribute6_A61'), (0.017, 'Attribute3_A32'), (0.016, 'Attribute7_A75'), (0.016, 'Attribute17_A173'), (0.016, 'Attribute16'), (0.015, 'Attribute9_A92'), (0.014, 'Attribute7_A73'), (0.014, 'Attribute3_A31'), (0.014, 'Attribute15_A152'), (0.014, 'Attribute12_A123'), (0.014, 'Attribute12_A122'), (0.014, 'Attribute12_A121'), (0.013, 'Attribute6_A65'), (0.013, 'Attribute4_A42'), (0.013, 'Attribute19_A192'), (0.013, 'Attribute19_A191'), (0.013, 'Attribute17_A172'), (0.013, 'Attribute12_A124'), (0.012, 'Attribute7_A74'), (0.012, 'Attribute3_A30'), (0.012, 'Attribute14_A143'), (0.011, 'Attribute4_A46'),



In [13]:
"""
Gaussian Naive-Bayes Classification.
"""


from sklearn.naive_bayes import GaussianNB

rf = GaussianNB()
scores = cross_val_score(rf, X, df['Label'], cv=10)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

# rf.fit(X_train, y_train)
# print(rf.score(X_test, y_test))

Accuracy: 0.70 (+/- 0.06)


In [14]:
"""
Multinomial Naive-Bayes Classification.
"""
from sklearn.naive_bayes import MultinomialNB

rf = MultinomialNB()
scores = cross_val_score(rf, X, df['Label'], cv=10)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

# rf.fit(X_train, y_train)
# print(rf.score(X_test, y_test))

Accuracy: 0.63 (+/- 0.11)


In [16]:
"""
Bernouli Naive-Bayes Classification.
"""
from sklearn.naive_bayes import BernoulliNB

rf = BernoulliNB()
scores = cross_val_score(rf, X, df['Label'], cv=10)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

# rf.fit(X_train, y_train)
# print(rf.score(X_test, y_test))

Accuracy: 0.73 (+/- 0.09)


In [21]:
"""
SVC SVM Classification.
"""

from sklearn.svm import SVC

rf = SVC()
scores = cross_val_score(rf, X, df['Label'], cv=10)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

# rf.fit(X_train, y_train)
# print(rf.score(X_test, y_test))

Accuracy: 0.69 (+/- 0.03)
