# Data Processing & Analysis

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk # library to work with human language data. We need it to process "title" column
import re
import pickle

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [35]:
path = "/Users/shivimalhotra/Documents/Eluvio/Eluvio_DS_Challenge.csv"
df = pd.read_csv(path)
df.head()

Unnamed: 0,time_created,date_created,up_votes,down_votes,title,over_18,author,category
0,1201232046,1/25/08,3,0,Scores killed in Pakistan clashes,False,polar,worldnews
1,1201232075,1/25/08,2,0,Japan resumes refuelling mission,False,polar,worldnews
2,1201232523,1/25/08,3,0,US presses Egypt on Gaza border,False,polar,worldnews
3,1201233290,1/25/08,1,0,Jump-start economy: Give health care to all,False,fadi420,worldnews
4,1201274720,1/25/08,4,0,Council of Europe bashes EU&UN terror blacklist,False,mhermans,worldnews


In [36]:
# counting unique values
n = len(pd.unique(df['category']))
print(n)
len(df['category'])

1


509236

### This means that column "category" has only 1 unique value i.e. "worldnews". So, we can drop that column.

In [37]:
# Let's check the same for down_votes column.
m = len(pd.unique(df['down_votes']))
print(m)
len(df['down_votes'])

1


509236

### Since, we are just analyzing the data, we can drop the columns time_created and date_created.

In [38]:
df = df.drop("category", axis = 1)
df = df.drop("down_votes", axis = 1)
df = df.drop("time_created", axis = 1)
df = df.drop("date_created", axis = 1)

In [18]:
df.head()

Unnamed: 0,up_votes,title,over_18,author
0,3,Scores killed in Pakistan clashes,False,polar
1,2,Japan resumes refuelling mission,False,polar
2,3,US presses Egypt on Gaza border,False,polar
3,1,Jump-start economy: Give health care to all,False,fadi420
4,4,Council of Europe bashes EU&UN terror blacklist,False,mhermans


In [39]:
# Let's check the same for down_votes column.
m = len(pd.unique(df['author']))
print(m)

85807


### Note: This number highlights that we have some some authors who have contributed more than once.

### Let's divide the strings in title column using sentence tokenization.

In [40]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/shivimalhotra/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shivimalhotra/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [41]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

In [42]:

# To get the stems of words in a sentence.
def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems

In [43]:
# To get the words themself in a sentence.
def tokenize_only(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens

In [44]:
#lowercase
title = df.title.str.lower()

In [45]:
# Get full stems and tokens to build vocabulary
def tokenized_stemmed(title):
    totalvocab_stemmed = []
    totalvocab_tokenized = []
    for i in title:
        allwords_stemmed = tokenize_and_stem(i) 
        totalvocab_stemmed.extend(allwords_stemmed) 

        allwords_tokenized = tokenize_only(i)
        totalvocab_tokenized.extend(allwords_tokenized)
    return totalvocab_stemmed, totalvocab_tokenized

In [47]:
totalvocab_stemmed_, totalvocab_tokenized_ = tokenized_stemmed(title)

In [57]:
print(totalvocab_stemmed_)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [58]:
import sklearn.feature_extraction.text as text
stopwords = nltk.corpus.stopwords.words('english')
my_stop_words = text.ENGLISH_STOP_WORDS.union(stopwords)

In [59]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(min_df =10**-3 ,analyzer = 'word', max_features=len(set(totalvocab_stemmed_)), stop_words=my_stop_words, tokenizer=tokenize_and_stem, ngram_range=(1,3))

tfidf_matrix = tfidf_vectorizer.fit_transform(title)

print(tfidf_matrix.shape)



(509236, 1815)


In [60]:
thre = np.quantile(df['up_votes'], 0.8)
y = [1 if i > thre else 0 for i in df['up_votes']]
y = np.array(y)
X_train, X_test, y_train, y_test = train_test_split(tfidf_matrix, y, test_size = 0.2, shuffle = True, random_state = 42)

# MultinomialNB

In [61]:
clf = MultinomialNB()
clf.fit(X_train, y_train)

MultinomialNB()

In [62]:
y_predict = clf.predict(X_test)
clf.score(X_test, y_test)

0.8050624459979577

In [63]:
print(classification_report(y_test, y_predict))

              precision    recall  f1-score   support

           0       0.81      1.00      0.89     81988
           1       0.56      0.00      0.00     19860

    accuracy                           0.81    101848
   macro avg       0.68      0.50      0.45    101848
weighted avg       0.76      0.81      0.72    101848



# Logistic Regression

In [71]:
LR = LogisticRegression(C=1.0, tol=0.01, penalty='l1', solver='liblinear')
LR.fit(X_train, y_train)

LogisticRegression(penalty='l1', solver='liblinear', tol=0.01)

In [72]:
y_predict = LR.predict(X_test)
LR.score(X_test, y_test)

0.8061326682900007

In [73]:
print(classification_report(y_test, y_predict))

              precision    recall  f1-score   support

           0       0.81      0.99      0.89     81988
           1       0.54      0.04      0.07     19860

    accuracy                           0.81    101848
   macro avg       0.68      0.51      0.48    101848
weighted avg       0.76      0.81      0.73    101848



# Gradient Boost

In [74]:
gbdt = GradientBoostingClassifier()
gbdt.fit(X_train, y_train)

GradientBoostingClassifier()

In [75]:
y_predict = gbdt.predict(X_test)
gbdt.score(X_test, y_test)

0.805415913910926

In [None]:
print(classification_report(y_test, y_predict))

# Random Forest

In [76]:
rfc = RandomForestClassifier(n_jobs = -1, max_features = 'sqrt', n_estimators = 10, oob_score = True)
rfc.fit(X_train, y_train)

  warn("Some inputs do not have OOB scores. "
  decision = (predictions[k] /


RandomForestClassifier(max_features='sqrt', n_estimators=10, n_jobs=-1,
                       oob_score=True)

In [77]:
y_predict = rfc.predict(X_test)
rfc.score(X_test, y_test)

0.7927401618097557

In [78]:
print(classification_report(y_test, y_predict))

              precision    recall  f1-score   support

           0       0.81      0.97      0.88     81988
           1       0.30      0.05      0.08     19860

    accuracy                           0.79    101848
   macro avg       0.55      0.51      0.48    101848
weighted avg       0.71      0.79      0.73    101848



# XgBoost

In [84]:
import xgboost as xgb
from xgboost.sklearn import XGBClassifier

In [85]:
xgb = XGBClassifier(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=5,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=27)

In [86]:
xgb.fit(X_train, y_train)






XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=1000, n_jobs=4, nthread=4, num_parallel_tree=1,
              random_state=27, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              seed=27, subsample=0.8, tree_method='exact',
              validate_parameters=1, verbosity=None)

In [87]:
y_predict = xgb.predict(X_test)

In [88]:
xgb.score(X_test, y_test)

0.8061523053962768

In [89]:
print(classification_report(y_test, y_predict))

              precision    recall  f1-score   support

           0       0.81      0.99      0.89     81988
           1       0.54      0.04      0.08     19860

    accuracy                           0.81    101848
   macro avg       0.67      0.52      0.48    101848
weighted avg       0.76      0.81      0.73    101848



# Result

### 1. The one thing common from all models is that "title" has a strong relation with "up_votes".