In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.externals import joblib



In [2]:
data = pd.read_csv("Cleaned Reddit Data.csv")
data.Title = data.Title.astype(str)
data.Content = data.Content.astype(str)
data.drop("Unnamed: 0", inplace=True, axis=1)
data.head()

Unnamed: 0,RID,Title,URL,Score,Comment_Score,Author,Content,Adult,Flair,Length_Title,Length_Content
0,g76o5f,The real loser Indias errupting Islamaphobia C...,https://www.reddit.com/r/india/comments/g76o5f...,88,53,HairLikeWinterFire,TLDR My unqualified opinion dalit political mo...,False,Politics,59,4775
1,futac9,Pitting community political party fucking stupid,https://www.reddit.com/r/india/comments/futac9...,194,73,chillinvillain122,First let start saying stupid whatever muslims...,False,Politics,48,1091
2,ff8sth,A new political party gave full front page ad ...,https://i.redd.it/yjo9wpy38el41.jpg,736,146,aaluinsonaout,,False,Politics,75,0
3,fxs1vy,Politics time corona WB CM questions Centres c...,https://www.timesnownews.com/india/article/pol...,83,22,ConcernedCitizen034,,False,Politics,82,0
4,fd7q3z,AAPs woeful response Delhi communal violence r...,https://scroll.in/article/954991/aaps-woeful-r...,213,45,Dumma1729,,False,Politics,76,0


# Taking TITLE as Feature

In [5]:
x = data["Title"]
y = data["Flair"]

xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2)

### 1. Random Forest Classifier

In [6]:
model_rforest = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', RandomForestClassifier(n_estimators=300)),
                ])
model_rforest.fit(xtrain, ytrain)

y_pred = model_rforest.predict(xtest)

In [7]:
print(classification_report(ytest, y_pred))
print(accuracy_score(ytest, y_pred))

                    precision    recall  f1-score   support

          AskIndia       0.67      0.94      0.78        31
  Business/Finance       0.00      0.00      0.00         4
       CAA-NRC-NPR       0.00      0.00      0.00         1
       Coronavirus       0.90      0.95      0.92        37
              Food       1.00      0.67      0.80         3
     Non-Political       0.94      0.91      0.93        34
       Photography       1.00      0.56      0.71         9
    Policy/Economy       0.67      0.46      0.55        13
          Politics       0.74      0.78      0.76        40
         Scheduled       1.00      1.00      1.00         8
Science/Technology       0.75      0.75      0.75         4
            Sports       1.00      0.78      0.88         9

          accuracy                           0.81       193
         macro avg       0.72      0.65      0.67       193
      weighted avg       0.81      0.81      0.80       193

0.8134715025906736


  'precision', 'predicted', average, warn_for)


### 2. Multinomial Naive Bayes Classifier

In [8]:
model_nb = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB()),
                ])
model_nb.fit(xtrain, ytrain)

y_pred = model_nb.predict(xtest)

In [9]:
print(classification_report(ytest, y_pred))
print(accuracy_score(ytest, y_pred))

                    precision    recall  f1-score   support

          AskIndia       0.44      1.00      0.61        31
  Business/Finance       0.00      0.00      0.00         4
       CAA-NRC-NPR       0.00      0.00      0.00         1
       Coronavirus       0.67      0.97      0.79        37
              Food       0.00      0.00      0.00         3
     Non-Political       0.82      0.91      0.86        34
       Photography       0.00      0.00      0.00         9
    Policy/Economy       0.00      0.00      0.00        13
          Politics       0.92      0.55      0.69        40
         Scheduled       1.00      0.88      0.93         8
Science/Technology       0.00      0.00      0.00         4
            Sports       0.00      0.00      0.00         9

          accuracy                           0.66       193
         macro avg       0.32      0.36      0.32       193
      weighted avg       0.57      0.66      0.58       193

0.6580310880829016


  'precision', 'predicted', average, warn_for)


### 3. Stochastic Gradient Descent Classifier

In [10]:
model_sgd = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier()),
                ])
model_sgd.fit(xtrain, ytrain)

y_pred = model_sgd.predict(xtest)

In [11]:
print(classification_report(ytest, y_pred))
print(accuracy_score(ytest, y_pred))

                    precision    recall  f1-score   support

          AskIndia       0.82      0.90      0.86        31
  Business/Finance       0.00      0.00      0.00         4
       CAA-NRC-NPR       0.00      0.00      0.00         1
       Coronavirus       0.92      0.95      0.93        37
              Food       1.00      0.67      0.80         3
     Non-Political       0.91      0.94      0.93        34
       Photography       1.00      0.56      0.71         9
    Policy/Economy       0.50      0.54      0.52        13
          Politics       0.90      0.88      0.89        40
         Scheduled       1.00      1.00      1.00         8
Science/Technology       0.75      0.75      0.75         4
            Sports       0.82      1.00      0.90         9

          accuracy                           0.85       193
         macro avg       0.72      0.68      0.69       193
      weighted avg       0.85      0.85      0.84       193

0.8497409326424871


### 4. Multi Layer Perceptron Classifier

In [12]:
model_mlp = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', MLPClassifier()),
                ])
model_mlp.fit(xtrain, ytrain)

y_pred = model_mlp.predict(xtest)

In [13]:
print(classification_report(ytest, y_pred))
print(accuracy_score(ytest, y_pred))

                    precision    recall  f1-score   support

          AskIndia       0.51      0.87      0.64        31
  Business/Finance       0.00      0.00      0.00         4
       CAA-NRC-NPR       0.00      0.00      0.00         1
       Coronavirus       0.79      0.89      0.84        37
              Food       0.00      0.00      0.00         3
     Non-Political       0.74      0.74      0.74        34
       Photography       1.00      0.33      0.50         9
    Policy/Economy       0.45      0.38      0.42        13
          Politics       0.84      0.65      0.73        40
         Scheduled       1.00      0.88      0.93         8
Science/Technology       0.75      0.75      0.75         4
            Sports       0.86      0.67      0.75         9

          accuracy                           0.70       193
         macro avg       0.58      0.51      0.52       193
      weighted avg       0.71      0.70      0.69       193

0.6994818652849741


  'precision', 'predicted', average, warn_for)


### 5. Logistic Regression

In [14]:
model_log = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression()),
                ])
model_log.fit(xtrain, ytrain)

y_pred = model_log.predict(xtest)



In [15]:
print(classification_report(ytest, y_pred))
print(accuracy_score(ytest, y_pred))

                    precision    recall  f1-score   support

          AskIndia       0.53      1.00      0.70        31
  Business/Finance       0.00      0.00      0.00         4
       CAA-NRC-NPR       0.00      0.00      0.00         1
       Coronavirus       0.74      0.95      0.83        37
              Food       0.00      0.00      0.00         3
     Non-Political       0.91      0.94      0.93        34
       Photography       1.00      0.33      0.50         9
    Policy/Economy       1.00      0.08      0.14        13
          Politics       0.89      0.78      0.83        40
         Scheduled       1.00      1.00      1.00         8
Science/Technology       0.00      0.00      0.00         4
            Sports       1.00      0.67      0.80         9

          accuracy                           0.76       193
         macro avg       0.59      0.48      0.48       193
      weighted avg       0.78      0.76      0.72       193

0.7616580310880829


### 6. XGBoost Classifier

In [16]:
model_xgb = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', XGBClassifier()),
                ])
model_xgb.fit(xtrain, ytrain)

y_pred = model_xgb.predict(xtest)

In [17]:
print(classification_report(ytest, y_pred))
print(accuracy_score(ytest, y_pred))

                    precision    recall  f1-score   support

          AskIndia       0.79      0.87      0.83        31
  Business/Finance       0.00      0.00      0.00         4
       CAA-NRC-NPR       0.00      0.00      0.00         1
       Coronavirus       0.97      0.95      0.96        37
              Food       1.00      0.67      0.80         3
     Non-Political       0.97      0.85      0.91        34
       Photography       1.00      0.56      0.71         9
    Policy/Economy       0.58      0.54      0.56        13
          Politics       0.63      0.80      0.70        40
         Scheduled       1.00      1.00      1.00         8
Science/Technology       0.75      0.75      0.75         4
            Sports       0.64      0.78      0.70         9

          accuracy                           0.80       193
         macro avg       0.69      0.65      0.66       193
      weighted avg       0.80      0.80      0.80       193

0.8031088082901554


### 7. Linear Support Vector Machine Classifier

In [18]:
model_linearsvc = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', LinearSVC()),
                ])
model_linearsvc.fit(xtrain, ytrain)

y_pred = model_linearsvc.predict(xtest)

In [19]:
print(classification_report(ytest, y_pred))
print(accuracy_score(ytest, y_pred))

                    precision    recall  f1-score   support

          AskIndia       0.77      0.97      0.86        31
  Business/Finance       0.00      0.00      0.00         4
       CAA-NRC-NPR       0.00      0.00      0.00         1
       Coronavirus       0.95      0.95      0.95        37
              Food       1.00      0.67      0.80         3
     Non-Political       0.92      0.97      0.94        34
       Photography       1.00      0.56      0.71         9
    Policy/Economy       0.64      0.54      0.58        13
          Politics       0.92      0.88      0.90        40
         Scheduled       1.00      1.00      1.00         8
Science/Technology       0.75      0.75      0.75         4
            Sports       0.82      1.00      0.90         9

          accuracy                           0.87       193
         macro avg       0.73      0.69      0.70       193
      weighted avg       0.86      0.87      0.86       193

0.8652849740932642


# Taking Content as Feature

In [20]:
x = data["Content"]
y = data["Flair"]

xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2)

### 1. Random Forest Classifier

In [21]:
model_rforest = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', RandomForestClassifier(n_estimators=300)),
                ])
model_rforest.fit(xtrain, ytrain)

y_pred = model_rforest.predict(xtest)

In [22]:
print(classification_report(ytest, y_pred))
print(accuracy_score(ytest, y_pred))

                    precision    recall  f1-score   support

          AskIndia       0.48      0.91      0.63        35
  Business/Finance       0.00      0.00      0.00         6
       CAA-NRC-NPR       0.00      0.00      0.00         2
       Coronavirus       0.32      1.00      0.48        34
              Food       0.00      0.00      0.00         6
     Non-Political       0.40      0.06      0.11        31
       Photography       0.00      0.00      0.00         4
    Policy/Economy       0.50      0.27      0.35        11
          Politics       0.00      0.00      0.00        34
         Scheduled       1.00      0.80      0.89        10
Science/Technology       0.00      0.00      0.00         5
            Sports       0.00      0.00      0.00        15

          accuracy                           0.41       193
         macro avg       0.22      0.25      0.21       193
      weighted avg       0.29      0.41      0.28       193

0.40932642487046633


  'precision', 'predicted', average, warn_for)


### 2. Multinomial Naive Bayes Classifier

In [23]:
model_nb = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB()),
                ])
model_nb.fit(xtrain, ytrain)

y_pred = model_nb.predict(xtest)

In [24]:
print(classification_report(ytest, y_pred))
print(accuracy_score(ytest, y_pred))

                    precision    recall  f1-score   support

          AskIndia       0.45      0.97      0.61        35
  Business/Finance       0.00      0.00      0.00         6
       CAA-NRC-NPR       0.00      0.00      0.00         2
       Coronavirus       0.31      1.00      0.48        34
              Food       0.00      0.00      0.00         6
     Non-Political       0.00      0.00      0.00        31
       Photography       0.00      0.00      0.00         4
    Policy/Economy       0.00      0.00      0.00        11
          Politics       0.00      0.00      0.00        34
         Scheduled       1.00      0.80      0.89        10
Science/Technology       0.00      0.00      0.00         5
            Sports       0.00      0.00      0.00        15

          accuracy                           0.39       193
         macro avg       0.15      0.23      0.16       193
      weighted avg       0.19      0.39      0.24       193

0.39378238341968913


  'precision', 'predicted', average, warn_for)


### 3. Stochastic Gradient Descent Classifier

In [25]:
model_sgd = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier()),
                ])
model_sgd.fit(xtrain, ytrain)

y_pred = model_sgd.predict(xtest)

In [26]:
print(classification_report(ytest, y_pred))
print(accuracy_score(ytest, y_pred))

                    precision    recall  f1-score   support

          AskIndia       0.60      0.83      0.70        35
  Business/Finance       0.00      0.00      0.00         6
       CAA-NRC-NPR       0.02      1.00      0.04         2
       Coronavirus       1.00      0.03      0.06        34
              Food       0.00      0.00      0.00         6
     Non-Political       0.38      0.10      0.15        31
       Photography       0.00      0.00      0.00         4
    Policy/Economy       0.55      0.55      0.55        11
          Politics       0.38      0.09      0.14        34
         Scheduled       1.00      0.90      0.95        10
Science/Technology       0.00      0.00      0.00         5
            Sports       1.00      0.13      0.24        15

          accuracy                           0.28       193
         macro avg       0.41      0.30      0.23       193
      weighted avg       0.57      0.28      0.29       193

0.2849740932642487


### 4. Multi Layer Perceptron Classifier

In [27]:
model_mlp = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', MLPClassifier()),
                ])
model_mlp.fit(xtrain, ytrain)

y_pred = model_mlp.predict(xtest)

In [28]:
print(classification_report(ytest, y_pred))
print(accuracy_score(ytest, y_pred))

                    precision    recall  f1-score   support

          AskIndia       0.54      0.89      0.67        35
  Business/Finance       0.00      0.00      0.00         6
       CAA-NRC-NPR       0.00      0.00      0.00         2
       Coronavirus       0.31      1.00      0.48        34
              Food       0.00      0.00      0.00         6
     Non-Political       0.50      0.13      0.21        31
       Photography       0.00      0.00      0.00         4
    Policy/Economy       0.50      0.27      0.35        11
          Politics       0.60      0.09      0.15        34
         Scheduled       1.00      0.80      0.89        10
Science/Technology       0.00      0.00      0.00         5
            Sports       1.00      0.07      0.12        15

          accuracy                           0.44       193
         macro avg       0.37      0.27      0.24       193
      weighted avg       0.50      0.44      0.34       193

0.43523316062176165


  'precision', 'predicted', average, warn_for)


### 5. Logistic Regression

In [29]:
model_log = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression()),
                ])
model_log.fit(xtrain, ytrain)

y_pred = model_log.predict(xtest)



In [30]:
print(classification_report(ytest, y_pred))
print(accuracy_score(ytest, y_pred))

                    precision    recall  f1-score   support

          AskIndia       0.42      0.97      0.59        35
  Business/Finance       0.00      0.00      0.00         6
       CAA-NRC-NPR       0.00      0.00      0.00         2
       Coronavirus       0.31      0.97      0.47        34
              Food       0.00      0.00      0.00         6
     Non-Political       0.00      0.00      0.00        31
       Photography       0.00      0.00      0.00         4
    Policy/Economy       0.00      0.00      0.00        11
          Politics       0.00      0.00      0.00        34
         Scheduled       1.00      0.70      0.82        10
Science/Technology       0.00      0.00      0.00         5
            Sports       0.00      0.00      0.00        15

          accuracy                           0.38       193
         macro avg       0.14      0.22      0.16       193
      weighted avg       0.18      0.38      0.23       193

0.38341968911917096


### 6. XGBoost Classifier

In [31]:
model_xgb = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', XGBClassifier()),
                ])
model_xgb.fit(xtrain, ytrain)

y_pred = model_xgb.predict(xtest)

In [32]:
print(classification_report(ytest, y_pred))
print(accuracy_score(ytest, y_pred))

                    precision    recall  f1-score   support

          AskIndia       0.54      0.77      0.64        35
  Business/Finance       0.00      0.00      0.00         6
       CAA-NRC-NPR       0.00      0.00      0.00         2
       Coronavirus       0.32      1.00      0.48        34
              Food       0.00      0.00      0.00         6
     Non-Political       0.40      0.13      0.20        31
       Photography       0.00      0.00      0.00         4
    Policy/Economy       0.62      0.45      0.53        11
          Politics       0.50      0.06      0.11        34
         Scheduled       1.00      0.90      0.95        10
Science/Technology       0.67      0.40      0.50         5
            Sports       1.00      0.13      0.24        15

          accuracy                           0.44       193
         macro avg       0.42      0.32      0.30       193
      weighted avg       0.49      0.44      0.36       193

0.44041450777202074


### 7. Linear Support Vector Machine Classifier

In [33]:
model_linearsvc = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', LinearSVC()),
                ])
model_linearsvc.fit(xtrain, ytrain)

y_pred = model_linearsvc.predict(xtest)

In [34]:
print(classification_report(ytest, y_pred))
print(accuracy_score(ytest, y_pred))

                    precision    recall  f1-score   support

          AskIndia       0.57      0.89      0.70        35
  Business/Finance       0.00      0.00      0.00         6
       CAA-NRC-NPR       0.00      0.00      0.00         2
       Coronavirus       0.32      1.00      0.48        34
              Food       0.00      0.00      0.00         6
     Non-Political       0.29      0.06      0.11        31
       Photography       0.00      0.00      0.00         4
    Policy/Economy       0.60      0.55      0.57        11
          Politics       0.75      0.09      0.16        34
         Scheduled       1.00      0.90      0.95        10
Science/Technology       0.00      0.00      0.00         5
            Sports       1.00      0.13      0.24        15

          accuracy                           0.45       193
         macro avg       0.38      0.30      0.27       193
      weighted avg       0.50      0.45      0.36       193

0.45077720207253885


# Taking TITLE + CONTENT + URL as Feature

In [35]:
x = data["Title"] + data["Content"] + data["URL"]
y = data["Flair"]

xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2)

### 1. Random Forest Classifier

In [36]:
model_rforest = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', RandomForestClassifier(n_estimators=300)),
                ])
model_rforest.fit(xtrain, ytrain)

y_pred = model_rforest.predict(xtest)

In [37]:
print(classification_report(ytest, y_pred))
print(accuracy_score(ytest, y_pred))

                    precision    recall  f1-score   support

          AskIndia       0.67      0.97      0.79        32
  Business/Finance       0.00      0.00      0.00         1
       CAA-NRC-NPR       0.00      0.00      0.00         2
       Coronavirus       0.85      0.98      0.91        45
              Food       0.00      0.00      0.00         3
     Non-Political       0.89      0.82      0.85        39
       Photography       0.67      0.33      0.44         6
    Policy/Economy       1.00      0.88      0.93         8
          Politics       0.79      0.74      0.77        31
         Scheduled       1.00      0.89      0.94         9
Science/Technology       1.00      0.33      0.50         3
            Sports       0.91      0.71      0.80        14

          accuracy                           0.82       193
         macro avg       0.65      0.55      0.58       193
      weighted avg       0.81      0.82      0.80       193

0.8186528497409327


  'precision', 'predicted', average, warn_for)


### 2. Multinomial Naive Bayes Classifier

In [38]:
model_nb = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB()),
                ])
model_nb.fit(xtrain, ytrain)

y_pred = model_nb.predict(xtest)

In [39]:
print(classification_report(ytest, y_pred))
print(accuracy_score(ytest, y_pred))

                    precision    recall  f1-score   support

          AskIndia       0.44      1.00      0.62        32
  Business/Finance       0.00      0.00      0.00         1
       CAA-NRC-NPR       0.00      0.00      0.00         2
       Coronavirus       0.75      0.98      0.85        45
              Food       0.00      0.00      0.00         3
     Non-Political       0.68      0.54      0.60        39
       Photography       0.00      0.00      0.00         6
    Policy/Economy       0.00      0.00      0.00         8
          Politics       0.88      0.68      0.76        31
         Scheduled       1.00      0.78      0.88         9
Science/Technology       0.00      0.00      0.00         3
            Sports       0.00      0.00      0.00        14

          accuracy                           0.65       193
         macro avg       0.31      0.33      0.31       193
      weighted avg       0.57      0.65      0.58       193

0.6476683937823834


  'precision', 'predicted', average, warn_for)


### 3. Stochastic Gradient Descent Classifier

In [40]:
model_sgd = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier()),
                ])
model_sgd.fit(xtrain, ytrain)

y_pred = model_sgd.predict(xtest)

In [41]:
print(classification_report(ytest, y_pred))
print(accuracy_score(ytest, y_pred))

                    precision    recall  f1-score   support

          AskIndia       0.81      0.91      0.85        32
  Business/Finance       1.00      1.00      1.00         1
       CAA-NRC-NPR       1.00      0.50      0.67         2
       Coronavirus       0.96      1.00      0.98        45
              Food       1.00      0.67      0.80         3
     Non-Political       0.94      0.79      0.86        39
       Photography       0.80      0.67      0.73         6
    Policy/Economy       0.88      0.88      0.88         8
          Politics       0.87      0.87      0.87        31
         Scheduled       1.00      0.89      0.94         9
Science/Technology       0.67      0.67      0.67         3
            Sports       0.72      0.93      0.81        14

          accuracy                           0.88       193
         macro avg       0.89      0.81      0.84       193
      weighted avg       0.89      0.88      0.88       193

0.8808290155440415


### 4. Multi Layer Perceptron Classifier

In [42]:
model_mlp = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', MLPClassifier()),
                ])
model_mlp.fit(xtrain, ytrain)

y_pred = model_mlp.predict(xtest)

In [43]:
print(classification_report(ytest, y_pred))
print(accuracy_score(ytest, y_pred))

                    precision    recall  f1-score   support

          AskIndia       0.60      0.88      0.71        32
  Business/Finance       1.00      1.00      1.00         1
       CAA-NRC-NPR       0.00      0.00      0.00         2
       Coronavirus       0.80      1.00      0.89        45
              Food       0.00      0.00      0.00         3
     Non-Political       0.73      0.62      0.67        39
       Photography       1.00      0.33      0.50         6
    Policy/Economy       1.00      0.75      0.86         8
          Politics       0.82      0.74      0.78        31
         Scheduled       1.00      0.89      0.94         9
Science/Technology       0.50      0.33      0.40         3
            Sports       1.00      0.71      0.83        14

          accuracy                           0.77       193
         macro avg       0.70      0.60      0.63       193
      weighted avg       0.77      0.77      0.75       193

0.7668393782383419


  'precision', 'predicted', average, warn_for)


### 5. Logistic Regression

In [44]:
model_log = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression()),
                ])
model_log.fit(xtrain, ytrain)

y_pred = model_log.predict(xtest)



In [45]:
print(classification_report(ytest, y_pred))
print(accuracy_score(ytest, y_pred))

                    precision    recall  f1-score   support

          AskIndia       0.57      1.00      0.73        32
  Business/Finance       0.00      0.00      0.00         1
       CAA-NRC-NPR       0.00      0.00      0.00         2
       Coronavirus       0.88      1.00      0.94        45
              Food       0.00      0.00      0.00         3
     Non-Political       0.76      0.64      0.69        39
       Photography       0.50      0.17      0.25         6
    Policy/Economy       1.00      0.38      0.55         8
          Politics       0.83      0.81      0.82        31
         Scheduled       1.00      0.78      0.88         9
Science/Technology       1.00      0.33      0.50         3
            Sports       1.00      0.71      0.83        14

          accuracy                           0.77       193
         macro avg       0.63      0.48      0.52       193
      weighted avg       0.78      0.77      0.75       193

0.772020725388601


### 6. XGBoost Classifier

In [46]:
model_xgb = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', XGBClassifier()),
                ])
model_xgb.fit(xtrain, ytrain)

y_pred = model_xgb.predict(xtest)

In [47]:
print(classification_report(ytest, y_pred))
print(accuracy_score(ytest, y_pred))

                    precision    recall  f1-score   support

          AskIndia       0.91      0.94      0.92        32
  Business/Finance       0.00      0.00      0.00         1
       CAA-NRC-NPR       0.00      0.00      0.00         2
       Coronavirus       1.00      1.00      1.00        45
              Food       0.67      0.67      0.67         3
     Non-Political       0.94      0.87      0.91        39
       Photography       0.83      0.83      0.83         6
    Policy/Economy       0.83      0.62      0.71         8
          Politics       0.73      0.87      0.79        31
         Scheduled       1.00      0.89      0.94         9
Science/Technology       0.50      0.67      0.57         3
            Sports       0.93      0.93      0.93        14

          accuracy                           0.89       193
         macro avg       0.70      0.69      0.69       193
      weighted avg       0.88      0.89      0.88       193

0.8860103626943006


### 7. Linear Support Vector Machine Classifier

In [48]:
model_linearsvc = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', LinearSVC()),
                ])
model_linearsvc.fit(xtrain, ytrain)

y_pred = model_linearsvc.predict(xtest)

In [49]:
print(classification_report(ytest, y_pred))
print(accuracy_score(ytest, y_pred))

                    precision    recall  f1-score   support

          AskIndia       0.79      0.97      0.87        32
  Business/Finance       1.00      1.00      1.00         1
       CAA-NRC-NPR       0.00      0.00      0.00         2
       Coronavirus       0.94      1.00      0.97        45
              Food       1.00      0.67      0.80         3
     Non-Political       0.94      0.79      0.86        39
       Photography       0.62      0.83      0.71         6
    Policy/Economy       1.00      0.88      0.93         8
          Politics       0.87      0.87      0.87        31
         Scheduled       1.00      0.89      0.94         9
Science/Technology       0.67      0.67      0.67         3
            Sports       0.92      0.86      0.89        14

          accuracy                           0.89       193
         macro avg       0.81      0.79      0.79       193
      weighted avg       0.89      0.89      0.88       193

0.8860103626943006


Looking at the Accuracy Score and Classification Report of all the Models Trained above including three different types of features which are TITLE, CONTENT and TITLE + CONTENT + URL I found out that the model which has performed best based on the criteria described above is the Linear Support Vector Machine with Using TITLE + CONTENT + URL as Feature. So I am dumping that model for deployment purpose.

# Dumping The Model

In [52]:
x = data['Title'] + data['Content'] + data['URL']
y = data['Flair']

xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2)

model_linearsvc = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', LinearSVC()),
                ])

final_model = model_linearsvc.fit(xtrain, ytrain)

In [53]:
joblib.dump(final_model, open('production_model.pkl', 'wb'))

# Loading and Checking The Model Performance On Random Data

In [54]:
model = joblib.load('production_model.pkl')

In [73]:
import nltk
import re
import praw
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
stop_words = set(stopwords.words("english"))

client_id = "zMvnVRM8MN_MRw"
client_secret = "-JIqik-pmfgku1BVB_2gcoxnMwM"
user_agent = "FlareReddit"
username = "psatreddit"
password = "pankaj123"

reddit = praw.Reddit(client_id = client_id, client_secret = client_secret, user_agent = user_agent, username = username, password = password)

def clean_text(text):
    text = re.sub('[/(){}\[\]#\|@,"".?'':;*!$]', '', text)
    text = re.sub('[^0-9A-Za-z #+_]', '', text)
    text = str(" ".join([w for w in word_tokenize(text) if w not in stop_words]))
    return text

def prediction(url):
    submission = reddit.submission(url = url)
    data['Title'] = str(submission.title)
    data['Content'] = str(submission.selftext)
    data['URL'] = str(submission.url)
    
    data['Title'] = clean_text(str(data['Title']))
    data['Content'] = clean_text(str(data['Content']))
    
    combined = data['Title'] + data['Content'] + data['URL']
    return model.predict(combined)[0]

In [74]:
prediction("https://www.reddit.com/r/india/comments/d1m9ld/iran_removes_antiindia_banners_from_pak_consulate/")

'AskIndia'