# Import statements

In [13]:
#loading the file in dataframe
import pandas as pd
df = pd.read_csv("newdatasetfinal.csv")
df.head()

Unnamed: 0,title,NewCategory
0,Bitcoin is down 60 percent this year. Here's w...,Business & Finance
1,6 health problems marijuana could treat better...,Crime
2,9 charts that explain the history of global we...,Business & Finance
3,Remember when legal marijuana was going to sen...,Crime
4,Obamacare succeeded for one simple reason: it'...,Technology & Health


In [14]:
#creating category_id feature for NewCategory
from io import StringIO
col = ['NewCategory', 'title']
df = df[col]
df = df[pd.notnull(df['title'])]
df.columns = ['NewCategory', 'title']
df['category_id'] = df['NewCategory'].factorize()[0]
category_id_df = df[['NewCategory', 'category_id']].drop_duplicates().sort_values('category_id')
category_to_id = dict(category_id_df.values)
id_to_category = dict(category_id_df[['category_id', 'NewCategory']].values)
df.head()

Unnamed: 0,NewCategory,title,category_id
0,Business & Finance,Bitcoin is down 60 percent this year. Here's w...,0
1,Crime,6 health problems marijuana could treat better...,1
2,Business & Finance,9 charts that explain the history of global we...,0
3,Crime,Remember when legal marijuana was going to sen...,1
4,Technology & Health,Obamacare succeeded for one simple reason: it'...,2


# Creating new features

In [15]:
#creating numeric features for title using TF-IDF Vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')
features = tfidf.fit_transform(df.title).toarray()
labels = df.category_id
features.shape

(16798, 5527)

In [16]:
#Terms in the form of unigram and bigram that are most correlated to each category
from sklearn.feature_selection import chi2
import numpy as np
N = 2
for title, category in sorted(category_to_id.items()):
    features_chi2 = chi2(features, labels == category)
    indices = np.argsort(features_chi2[0])
    feature_names = np.array(tfidf.get_feature_names())[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    print("# '{}':".format(category))
    print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-N:])))
    print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-N:])))

# '0':
  . Most correlated unigrams:
. bank
. stocks
  . Most correlated bigrams:
. men wearhouse
. jos bank
# '1':
  . Most correlated unigrams:
. ferguson
. police
  . Most correlated bigrams:
. sex marriage
. supreme court
# '4':
  . Most correlated unigrams:
. justin
. bieber
  . Most correlated bigrams:
. game thrones
. justin bieber
# '3':
  . Most correlated unigrams:
. donald
. trump
  . Most correlated bigrams:
. hillary clinton
. donald trump
# '2':
  . Most correlated unigrams:
. snowden
. titanfall
  . Most correlated bigrams:
. flappy bird
. edward snowden


# Spiltting the Data into Training and Testing

In [17]:
#Training and Testing data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features,labels, test_size = 0.2, random_state = 0)

# Training Decision Tree Classifier

In [57]:
#Training the Decision Tree Model for Classification
from sklearn.tree import DecisionTreeClassifier  
classifier = DecisionTreeClassifier( random_state = 0,max_depth=50, min_samples_leaf=1)  
classifier.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=50,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')

In [58]:
y_pred = classifier.predict(X_test)

In [62]:
#Finding the accuracy of Model
from sklearn.metrics import accuracy_score 
acc = accuracy_score(y_test, y_pred)
print("Accuracy of Decision tree model:", acc*100)

Accuracy of Decision tree model: 60.08928571428571


# Hyperparameter Tuning

In [64]:
#Max Depth= 500
classifier = DecisionTreeClassifier( splitter ='best',
            random_state = 0,max_depth=500, min_samples_leaf=1)  
classifier.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=500,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')

In [65]:
y_pred = classifier.predict(X_test)

In [66]:
acc = accuracy_score(y_test, y_pred)
print("Accuracy of Decision tree model after tuning:", acc*100)

Accuracy of Decision tree model after tuning: 77.97619047619048


In [89]:
#Max Depth= 800
classifier = DecisionTreeClassifier( splitter ='best',
            random_state = 0,max_depth=800, min_samples_leaf=1)  
classifier.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=800,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')

In [90]:
y_pred = classifier.predict(X_test)

# Performance Metric

1. Accuracy

In [91]:
acc = accuracy_score(y_test, y_pred)
print("Accuracy of Decision tree model after tuning:", acc*100)

Accuracy of Decision tree model after tuning: 78.45238095238095


2. F1 Score

In [97]:
from sklearn.metrics import f1_score
print("F1 Score for all the categories:")
f1_score(y_test, y_pred, average=None)

F1 Score for all the categories:


array([0.79414634, 0.6614786 , 0.7933635 , 0.83033419, 0.75791139])

3. Confusion Matrix

In [94]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[407,  11,  35,  40,  33],
       [ 11, 255,  22,  56,  46],
       [ 26,  21, 526,  44,  43],
       [ 33,  71,  48, 969,  65],
       [ 22,  23,  35,  39, 479]], dtype=int64)

4. Precision-Recall

In [96]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.77      0.79       526
           1       0.67      0.65      0.66       390
           2       0.79      0.80      0.79       660
           3       0.84      0.82      0.83      1186
           4       0.72      0.80      0.76       598

   micro avg       0.78      0.78      0.78      3360
   macro avg       0.77      0.77      0.77      3360
weighted avg       0.79      0.78      0.78      3360



References
https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
https://machinelearningmastery.com/implement-decision-tree-algorithm-scratch-python/
https://www.datacamp.com/community/tutorials/decision-tree-classification-python