## This notebook pre-processed data and trains different machine learning models

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk

In [2]:
dataset = pd.read_csv('morefeaturesDataset.csv')
dataset = dataset.drop(columns=['ID'])
dataset.head()

Unnamed: 0,Title,Url,Content,Comments,Flair
0,A polite request to all Indians here,https://www.reddit.com/r/india/comments/g2ct57...,I don't know if it is the same situation in ot...,Our society thrives on abuse of power. We let...,Politics
1,Pitting a community against a political party ...,https://www.reddit.com/r/india/comments/futac9...,First of all let me start by saying it was stu...,Our country is just too far in at the moment ...,Politics
2,A new political party gave a full front page a...,https://i.redd.it/yjo9wpy38el41.jpg,,This looks like an IIPM ad 1. Where did they ...,Politics
3,Hit by backlash over posts on lack of medical ...,https://theprint.in/india/hit-by-backlash-over...,,"Well, Some people really deserve to die. ~~/s...",Politics
4,Politics in the time of corona: WB CM question...,https://www.timesnownews.com/india/article/pol...,,"Oh FFS. \n\nYellow, Orange, Green, Red, all a...",Politics


## Text pre-processing

In [3]:
# Removing HTML syntaxes
from bs4 import BeautifulSoup as bs

# removing punctuation marks
import re

import nltk

# Removing stop-words
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shreya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
def transformText(rawText):
    """
    This function takes a raw string review as input, and applies the following steps to return a refined string review as output:
    1. removing HTML tags
    2. removing punctuation marks
    3. converting the text to lower case
    4. splitting the string into words
    5. removing stop words
    """
    # remove HTML tags
    noHtml = bs(rawText, "lxml").get_text()
    
    #remove punctuation marks
    letters_only = re.sub("[^a-zA-Z ]", "", noHtml)
    
    #convert to lower case
    tolower = letters_only.lower()
    
    #split
    words = tolower.split()
    
    #convert stopwords list to set for fast searching
    stopwordsSet = set(stopwords.words("english"))
    
    #remove stop words
    RefinedWords = [w for w in words if w not in stopwordsSet]
    
    #form new review
    return(" ".join(RefinedWords) )

### Transforming textual features

In [5]:
# 1. Title
dataset['Title'] = dataset['Title'].apply(transformText)

# 2. Content
#Since content contains NaN values, applying transformation to selected posts only
dataset['Content'] = dataset['Content'].apply(lambda x: transformText(x) if(pd.notnull(x)) else x) 
dataset.head()

# 3. Comments
#since comments can also contain null, we use the same method as used for content
dataset['Comments'] = dataset['Comments'].apply(lambda x: transformText(x) if(pd.notnull(x)) else x) 

dataset.head()

Unnamed: 0,Title,Url,Content,Comments,Flair
0,polite request indians,https://www.reddit.com/r/india/comments/g2ct57...,dont know situation countries india seen lot o...,society thrives abuse power let many idiots ab...,Politics
1,pitting community political party fucking stupid,https://www.reddit.com/r/india/comments/futac9...,first let start saying stupid whatever muslims...,country far moment theres turning back best ho...,Politics
2,new political party gave full front page ad po...,https://i.redd.it/yjo9wpy38el41.jpg,,looks like iipm ad get funds full page ads use...,Politics
3,hit backlash posts lack medical gear doctors g...,https://theprint.in/india/hit-by-backlash-over...,,well people really deserve die country fucking...,Politics
4,politics time corona wb cm questions centres c...,https://www.timesnownews.com/india/article/pol...,,oh ffs yellow orange green red used emergency ...,Politics


## Building Model Functions

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix

### Naive Bayes Classifier

In [7]:
def nb_classifier(X_train, X_test, y_train, y_test):
  
    from sklearn.naive_bayes import MultinomialNB


    nb = Pipeline([('vect', CountVectorizer()),
                 ('tfidf', TfidfTransformer()),
                 ('clf', MultinomialNB()),
                ], verbose=True)
    nb.fit(X_train, y_train)

    y_pred = nb.predict(X_test)

    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred))

### Linear SVM

In [8]:
def linear_svm(X_train, X_test, y_train, y_test):
    from sklearn.svm import LinearSVC
    
    svm = Pipeline([('vect', CountVectorizer()),
                 ('tfidf', TfidfTransformer()),
                 ('clf', LinearSVC()),
                ], verbose=True)
    svm.fit(X_train, y_train)
    
    y_pred = svm.predict(X_test)
    
    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred))

### Logistic Regression

In [9]:
def logistic_regression(X_train, X_test, y_train, y_test):
    from sklearn.linear_model import LogisticRegression
    
    log_reg = Pipeline([('vect', CountVectorizer()),
                 ('tfidf', TfidfTransformer()),
                 ('clf', LogisticRegression()),
                ], verbose=True)
    log_reg.fit(X_train, y_train)
    
    y_pred = log_reg.predict(X_test)
    
    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred))

### Random Forest

In [10]:
def random_forests(X_train, X_test, y_train, y_test):
    from sklearn.ensemble import RandomForestClassifier
    
    rf = Pipeline([('vect', CountVectorizer()),
                 ('tfidf', TfidfTransformer()),
                 ('clf', RandomForestClassifier()),
                ], verbose=True)
    rf.fit(X_train, y_train)
    
    y_pred = rf.predict(X_test)
    
    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred))

### Multi layer perceptron

In [11]:
def mlp(X_train, X_test, y_train, y_test):
    
    from sklearn.neural_network import MLPClassifier
    
    mlp = Pipeline([('vect', CountVectorizer()),
                 ('tfidf', TfidfTransformer()),
                 ('clf', MLPClassifier()),
                ], verbose=True)
    mlp.fit(X_train, y_train)
    
    y_pred = mlp.predict(X_test)
    
    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred))

## Splitting Data

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
X = dataset.Title
y = dataset.Flair

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

print("Original dataset size: ", X.shape)
print("Training dataset size: ", X_train.shape)

Original dataset size:  (1200,)
Training dataset size:  (804,)


## Training Dataset on the models

In [20]:
nb_classifier(X_train, X_test, y_train, y_test)

[Pipeline] .............. (step 1 of 3) Processing vect, total=   0.0s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.0s
[Pipeline] ............... (step 3 of 3) Processing clf, total=   0.0s
accuracy 0.6742424242424242
                    precision    recall  f1-score   support

          AskIndia       0.79      0.62      0.70        37
  Business/Finance       0.39      0.41      0.40        32
       CAA-NRC-NPR       0.75      0.90      0.82        30
       Coronavirus       0.64      0.94      0.76        32
              Food       0.75      0.83      0.79        36
     Non-Political       0.50      0.86      0.63        28
       Photography       0.78      0.85      0.81        33
    Policy/Economy       0.64      0.43      0.52        37
          Politics       0.78      0.51      0.62        35
         Scheduled       0.71      0.48      0.58        31
Science/Technology       0.63      0.59      0.61        32
            Sports       0.86      0.7

In [21]:
linear_svm(X_train, X_test, y_train, y_test)

[Pipeline] .............. (step 1 of 3) Processing vect, total=   0.0s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.0s
[Pipeline] ............... (step 3 of 3) Processing clf, total=   0.0s
accuracy 0.7070707070707071
                    precision    recall  f1-score   support

          AskIndia       0.68      0.68      0.68        37
  Business/Finance       0.35      0.50      0.41        32
       CAA-NRC-NPR       0.84      0.87      0.85        30
       Coronavirus       0.82      0.97      0.89        32
              Food       0.82      0.89      0.85        36
     Non-Political       0.82      0.82      0.82        28
       Photography       0.85      0.85      0.85        33
    Policy/Economy       0.62      0.49      0.55        37
          Politics       0.81      0.60      0.69        35
         Scheduled       0.53      0.52      0.52        31
Science/Technology       0.74      0.62      0.68        32
            Sports       0.75      0.7

In [22]:
logistic_regression(X_train, X_test, y_train, y_test)

[Pipeline] .............. (step 1 of 3) Processing vect, total=   0.0s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.0s
[Pipeline] ............... (step 3 of 3) Processing clf, total=   0.2s
accuracy 0.7070707070707071
                    precision    recall  f1-score   support

          AskIndia       0.66      0.62      0.64        37
  Business/Finance       0.31      0.56      0.40        32
       CAA-NRC-NPR       0.90      0.90      0.90        30
       Coronavirus       0.79      0.97      0.87        32
              Food       0.84      0.86      0.85        36
     Non-Political       0.92      0.82      0.87        28
       Photography       0.87      0.79      0.83        33
    Policy/Economy       0.64      0.43      0.52        37
          Politics       0.85      0.63      0.72        35
         Scheduled       0.56      0.58      0.57        31
Science/Technology       0.79      0.59      0.68        32
            Sports       0.76      0.7

In [23]:
random_forests(X_train, X_test, y_train, y_test)

[Pipeline] .............. (step 1 of 3) Processing vect, total=   0.0s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.0s
[Pipeline] ............... (step 3 of 3) Processing clf, total=   0.3s
accuracy 0.7146464646464646
                    precision    recall  f1-score   support

          AskIndia       0.68      0.57      0.62        37
  Business/Finance       0.32      0.62      0.42        32
       CAA-NRC-NPR       0.90      0.87      0.88        30
       Coronavirus       0.94      0.94      0.94        32
              Food       0.86      0.89      0.88        36
     Non-Political       0.92      0.79      0.85        28
       Photography       0.81      0.88      0.84        33
    Policy/Economy       0.58      0.49      0.53        37
          Politics       0.85      0.66      0.74        35
         Scheduled       0.67      0.52      0.58        31
Science/Technology       0.83      0.59      0.69        32
            Sports       0.69      0.8

In [19]:
mlp(X_train, X_test, y_train, y_test)

[Pipeline] .............. (step 1 of 3) Processing vect, total=   0.0s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.0s
[Pipeline] ............... (step 3 of 3) Processing clf, total=   3.9s
accuracy 0.5580808080808081
                    precision    recall  f1-score   support

          AskIndia       0.53      0.49      0.51        37
  Business/Finance       0.21      0.25      0.23        32
       CAA-NRC-NPR       0.75      0.80      0.77        30
       Coronavirus       0.66      0.66      0.66        32
              Food       0.64      0.69      0.67        36
     Non-Political       0.62      0.57      0.59        28
       Photography       0.76      0.76      0.76        33
    Policy/Economy       0.48      0.41      0.44        37
          Politics       0.52      0.49      0.50        35
         Scheduled       0.37      0.52      0.43        31
Science/Technology       0.67      0.56      0.61        32
            Sports       0.67      0.5