In [75]:
# Load Train and Test Data
import pandas as pd
import numpy as np

train_df = pd.read_csv("train-rateit.csv")

test_df = pd.read_csv("test-rateit.csv")

In [76]:
# Separate features and label
train_label = train_df['target_bin']
train_features = train_df.drop(['target_bin', 'content_id'], axis=1)

test_features = test_df.drop(['content_id'], axis=1)

In [77]:
train_features.columns

Index([u'avg_word_length', u'day_of_week', u'feat1', u'feat2', u'feat3',
       u'feat4', u'feat5', u'feat6', u'feat7', u'feat8', u'feat9', u'feat10',
       u'feat11', u'feat12', u'feat13', u'feat14', u'feat15', u'feat16',
       u'feat17', u'feat18', u'feat19', u'feat20', u'feat21', u'feat22',
       u'feat23', u'feat24', u'feat25', u'feat26', u'feat27', u'feat28',
       u'feat29', u'feat30', u'images', u'meta_length', u'negativity',
       u'negativity2', u'negativity3', u'negativity4', u'negativity5',
       u'num_content_words', u'num_links', u'num_links_2', u'num_title_words',
       u'num_uniq_content_words', u'positivity', u'positivity2',
       u'positivity3', u'positivity4', u'positivity5', u'ratio_non_stop_words',
       u'ratio_uniq_non_stop_words', u'sentiment', u'sentiment2', u'topic_1',
       u'topic_2', u'topic_3', u'topic_4', u'topic_5', u'topic_6',
       u'tracked_for_days', u'videos'],
      dtype='object')

In [78]:
# Fill empty values with zero
train_features.fillna(0, inplace=True)
test_features.fillna(0, inplace=True)

In [79]:
# Fill na in day of week where day of week is 0
train_features['day_of_week'] = train_features['day_of_week'].apply(lambda x: "NA" if x==0 else x)
test_features['day_of_week'] = test_features['day_of_week'].apply(lambda x: "NA" if x==0 else x)

In [80]:
# One-Hot Encode day of week
train_features = pd.get_dummies(train_features)
test_features = pd.get_dummies(test_features)

In [81]:
# %matplotlib inline

# import matplotlib.pyplot as plt
# plt.rcParams['figure.figsize'] = [50, 50]

# #fig = plt.figure(figsize = (11,5))
# train_features.hist()
# plt.show()

In [82]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(train_features, train_label, test_size=0.20)

In [83]:
from sklearn.decomposition import PCA

pca = PCA(n_components = 4)
pca.fit(X_train)

X_train = pca.transform(X_train)
X_test = pca.transform(X_test)

In [84]:
# Try Random Forest
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

**Random Forest has given results**

In [85]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, y_pred)

0.690775018167828

In [86]:
# TODO: Import 'GridSearchCV', 'make_scorer', and any other necessary libraries
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import make_scorer

# TODO: Initialize the classifier
clf = RandomForestClassifier()

# TODO: Create the parameters list you wish to tune, using a dictionary if needed.
# HINT: parameters = {'parameter_1': [value1, value2], 'parameter_2': [value1, value2]}
parameters = {'n_estimators': [10, 50, 75, 100], 'max_depth': [2, 25, 50, 75, 100]}

# TODO: Make an fbeta_score scoring object using make_scorer()
scorer = make_scorer(roc_auc_score)

# TODO: Perform grid search on the classifier using 'scorer' as the scoring method using GridSearchCV()
grid_obj = GridSearchCV(clf, parameters, scorer)

# TODO: Fit the grid search object to the training data and find the optimal parameters using fit()
grid_fit = grid_obj.fit(X_train, y_train)

# Get the estimator
best_clf = grid_fit.best_estimator_
print best_clf
# Make predictions using the unoptimized and model
predictions = (clf.fit(X_train, y_train)).predict(X_test)
best_predictions = best_clf.predict(X_test)

# Report the before-and-afterscores
print("Unoptimized model\n------")
print("ROC AUC score on testing data: {:.4f}".format(roc_auc_score(y_test, predictions)))
print("\nOptimized Model\n------")
print("Final ROC AUC score on the testing data: {:.4f}".format(roc_auc_score(y_test, best_predictions)))

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=100, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)


Unoptimized model
------
ROC AUC score on testing data: 0.6858

Optimized Model
------
Final ROC AUC score on the testing data: 0.7171


In [32]:
# Get Feature importances
importances = pd.DataFrame({'feature': X_train.columns, 'importance':np.round(best_clf.feature_importances_, 3)})
importances = importances.sort_values('importance',ascending=False).set_index('feature')
importances.index.values

array(['feat9', 'feat8', 'feat23', 'feat16', 'feat18', 'feat17',
       'tracked_for_days', 'feat6', 'feat12', 'feat14',
       'num_content_words', 'num_uniq_content_words',
       'ratio_uniq_non_stop_words', 'feat15', 'avg_word_length', 'feat13',
       'feat3', 'feat22', 'feat2', 'feat10', 'sentiment', 'positivity3',
       'feat11', 'positivity', 'feat7', 'negativity3', 'num_links',
       'feat28', 'feat29', 'positivity2', 'negativity', 'feat27',
       'negativity2', 'num_title_words', 'images', 'feat24', 'feat30',
       'feat4', 'sentiment2', 'feat25', 'feat26', 'negativity5',
       'num_links_2', 'negativity4', 'feat19', 'feat21', 'feat20',
       'positivity4', 'meta_length', 'positivity5', 'videos', 'feat5',
       'topic_5', 'feat1', 'day_of_week_sunday', 'day_of_week_thursday',
       'day_of_week_tuesday', 'day_of_week_wednesday', 'topic_1',
       'topic_2', 'topic_3', 'topic_4', 'topic_6', 'day_of_week_friday',
       'day_of_week_monday', 'day_of_week_saturday',
    

In [40]:
n_features = 5
top_n_features = importances.index.values[:n_features].tolist()

In [42]:
# Import functionality for cloning a model
from sklearn.base import clone

# Reduce the feature space
X_train_reduced = X_train[top_n_features]
X_test_reduced = X_test[top_n_features]

# Train on the "best" model found from grid search earlier
clf = (clone(best_clf)).fit(X_train_reduced, y_train)

# Make new predictions
reduced_predictions = clf.predict(X_test_reduced)

# Report scores from the final model using both versions of data
print("Final Model trained on full data\n------")
print("ROC AUC score on testing data: {:.4f}".format(roc_auc_score(y_test, predictions)))
print("\nFinal Model trained on reduced data\n------")
print("ROC AUC score on testing data: {:.4f}".format(roc_auc_score(y_test, reduced_predictions)))

Final Model trained on full data
------
ROC AUC score on testing data: 0.6732

Final Model trained on reduced data
------
ROC AUC score on testing data: 0.6976


In [89]:
#predictions = clf.predict(test_features[top_n_features])
test_features = pca.transform(test_features)
predictions = best_clf.predict(test_features)

In [90]:
np.savetxt("output.csv", np.asarray(predictions), delimiter=",")

In [35]:
# Check if these 10 features work good on any other model
from sklearn.ensemble import GradientBoostingClassifier

clf = GradientBoostingClassifier()
clf.fit(X_train_reduced, y_train)

y_pred = clf.predict(X_test_reduced)

In [36]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, y_pred)

0.5021494491006686