<h1>Reddit Flair Predictor</h1>
<h6>A Reddit flair predictor built with machine learning algorithms</h6>

<h2>Import Files</h2>

In [None]:
# Modules
import json
import os
import pickle
import string
import time

import dns
import numpy as np
import pandas as pd
import praw
import pymongo
import requests

# NLP PreProcessors
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Tokenizers, Metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import accuracy_score,classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Models
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

<h2>Extract Reddit Data</h2>
<h4>Extracted 6 Months of Submission Data from r/India</h4>
<p>Used the Pushshift API (api.pushshift.io) to collect the data</p>

In [None]:
# Scrape r/india submissions one day at a time via Pushshift, attach the
# top-level comment text fetched through PRAW, and persist the result as
# a CSV file and a pickle.

# Each Pushshift query covers one 24-hour window.
SECONDS_PER_DAY = 86400
# Number of one-day windows to walk back through, starting from "now".
DAYS_TO_FETCH = 190
# Seconds to wait for Pushshift before giving up on a request.
REQUEST_TIMEOUT = 60

currentTime = int(time.time())
prevTime = currentTime - SECONDS_PER_DAY

# Submission field -> (default used when the key is absent,
#                      whether the value is stored through str()).
# Insertion order determines the column order of the CSV.
submissionFields = {
    'author_flair_text': ('null', True),
    'author_fullname': ('null', True),
    'created_utc': (0, False),
    'domain': ('null', True),
    'is_crosspostable': ('false', False),
    'is_meta': ('false', False),
    'is_original_content': ('false', False),
    'is_reddit_media_domain': ('false', False),
    'is_robot_indexable': ('false', False),
    'is_self': ('false', False),
    'is_video': ('false', False),
    'post_hint': ('null', True),
    'link_flair_text': ('null', True),
    'media_only': ('null', True),
    'num_comments': (0, False),
    'permalink': ('null', True),
    'score': (0, False),
    'selftext': ('null', True),
    'title': ('null', True),
    'total_awards_received': (0, False),
    'url': ('null', True),
}

dataDictionary = {field: [] for field in submissionFields}
dataDictionary['comments'] = []

# Credentials come from the environment when set; the fallbacks are the
# placeholder values this notebook shipped with.
reddit = praw.Reddit(client_id=os.environ.get('REDDIT_CLIENT_ID', 'ID'),
                     client_secret=os.environ.get('REDDIT_CLIENT_SECRET', 'SECRET'),
                     user_agent='default')

searchUrl = ('https://api.pushshift.io/reddit/submission/search/?subreddit=india'
             '&filter=author_flair_text,author_fullname,created_utc,domain,id,is_crosspostable,is_meta,is_original_content,is_reddit_media_domain,is_robot_indexable,is_self,is_video,post_hint,link_flair_text,media_only,num_comments,permalink,score,selftext,title,total_awards_received,url'
             '&size=500&after={after}&before={before}&sort=desc')

for i in range(1, DAYS_TO_FETCH + 1):
    url = searchUrl.format(after=prevTime, before=currentTime)
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    # Fail with the HTTP error itself rather than a confusing JSON/KeyError later.
    response.raise_for_status()
    req = response.json()
    for submission in req.get('data', []):
        # Only flaired submissions are usable as labelled training rows.
        if 'link_flair_text' not in submission:
            continue
        temp = reddit.submission(id=submission['id'])
        # limit=0 drops the "load more comments" stubs instead of fetching them.
        temp.comments.replace_more(limit=0)
        # Concatenate root-level comment bodies, each followed by one space.
        commentText = ''.join(str(comment.body) + ' '
                              for comment in temp.comments.list()
                              if comment.is_root)
        for field, (default, asString) in submissionFields.items():
            value = submission.get(field, default)
            dataDictionary[field].append(str(value) if asString else value)
        dataDictionary['comments'].append(commentText)
    # Slide the one-day window further into the past.
    currentTime = prevTime
    prevTime -= SECONDS_PER_DAY
    print('Done Day', i)

print(set(dataDictionary['link_flair_text']), len(set(dataDictionary['link_flair_text'])))

pandasFrame = pd.DataFrame(dataDictionary)
pandasFrame.to_csv('India_data_6months.csv', index=False)

# The context manager closes the file even if pickling fails.
with open("data.pickle", "wb") as pickle_out:
    pickle.dump(dataDictionary, pickle_out)

<h2>Insert data into Mongo Atlas Instance</h2>

In [None]:
# Upload the scraped CSV, one document per row, to the `data` collection
# of the `python` database.
# NOTE(review): "#" looks like a placeholder for the real connection string — confirm.
client = pymongo.MongoClient("#")
db = client["python"]
print('Connected')
collection = db["data"]
data = pd.read_csv('./India_data_6months.csv')
print('Read')
records = data.to_dict(orient='records')
collection.insert_many(records)
print('Completed')

<h2>Helper Functions to Pre-Process Data</h2>

In [None]:
# Translation table deleting all punctuation except '#', '+' and '_'.
_CLEAN_TEXT_TABLE = str.maketrans(
    '', '', string.punctuation.replace('#','').replace('+','').replace('_',''))
# Filled on first use with the stop-word set and the stemmer so they are
# built once instead of once per cleaned cell.
_CLEAN_TEXT_RESOURCES = {}

def cleanText(inputText):
    """Normalise free text into a space-joined string of stemmed tokens.

    Steps: drop non-ASCII characters, lower-case, split on whitespace,
    strip punctuation (keeping '#', '+', '_'), remove English stop words,
    and Porter-stem what remains.

    Parameters
    ----------
    inputText : str, float or None
        Raw text. Floats (e.g. NaN for a missing cell) and None are treated
        as empty text; any other non-string value is converted with str().

    Returns
    -------
    str
        The cleaned tokens joined by single spaces ('' for missing input).
    """
    if inputText is None or isinstance(inputText, float):
        inputText = ''
    inputText = str(inputText)

    if not _CLEAN_TEXT_RESOURCES:
        _CLEAN_TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _CLEAN_TEXT_RESOURCES['stemmer'] = PorterStemmer()
    stop_words = _CLEAN_TEXT_RESOURCES['stop_words']
    porter = _CLEAN_TEXT_RESOURCES['stemmer']

    tokens = inputText.encode('ascii', 'ignore').decode('utf-8').lower().split()
    # Punctuation is stripped before the stop-word test, so the test sees
    # the stripped form of each token.
    tokens = [token.translate(_CLEAN_TEXT_TABLE) for token in tokens]
    return ' '.join(porter.stem(token) for token in tokens if token not in stop_words)

def splitUrl(inputText):
    """Turn a URL, domain or permalink into space-separated tokens.

    The text is lower-cased and split on '/'; empty segments and the
    'https:' / 'http:' scheme markers are dropped. The first remaining
    segment is further split on '.', with 'com' and 'www' removed.

    Parameters
    ----------
    inputText : str or any
        The URL-like text. Non-string values (e.g. the float NaN pandas
        uses for a missing cell) are treated as empty.

    Returns
    -------
    str
        The tokens joined by single spaces, or '' when there is nothing
        to tokenise.
    """
    # Missing cells arrive as float NaN, which has no .lower().
    if not isinstance(inputText, str):
        return ''
    segments = [segment for segment in inputText.lower().split('/')
                if segment and segment not in ('https:', 'http:')]
    # Empty or scheme-only input leaves no segment to index.
    if not segments:
        return ''
    hostTokens = [token for token in segments[0].split('.')
                  if token not in ('com', 'www')]
    segments[0] = ' '.join(hostTokens)
    return ' '.join(segments)

def classifyTime(inputText):
    """Bucket a Unix timestamp into a part-of-day label.

    The timestamp is converted with int() and interpreted in the local
    time zone of the machine running the code. Hours 6-11 map to
    'Morning', 12-16 to 'Noon', 17-20 to 'Evening', everything else to
    'Night'.
    """
    hour = time.localtime(int(inputText)).tm_hour
    # (first hour, hour after last, label) for each daytime bucket.
    dayParts = (
        (6, 12, 'Morning'),
        (12, 17, 'Noon'),
        (17, 21, 'Evening'),
    )
    for firstHour, endHour, label in dayParts:
        if firstHour <= hour < endHour:
            return label
    return 'Night'

<h2>Load the saved data into Pandas Frame for use</h2>

In [None]:
# Load the scraped submissions from the CSV written by the extraction cell.
pandasFrame = pd.read_csv('India_data_6months.csv')
# Alternative dataset used during development: 'reddit-india-data1.csv'

# Column -> helper that converts its raw value into model-ready text.
columnCleaners = {
    'created_utc': classifyTime,
    'domain': splitUrl,
    'post_hint': cleanText,
    'permalink': splitUrl,
    'selftext': cleanText,
    'title': cleanText,
    'url': splitUrl,
    'comments': cleanText,
}
for columnName, cleaner in columnCleaners.items():
    pandasFrame[columnName] = pandasFrame[columnName].apply(cleaner)

# Collapse blank / whitespace-only cells and NaN into a single empty-string marker.
pandasFrame = pandasFrame.replace(r'^\s*$', np.nan, regex=True)
pandasFrame = pandasFrame.replace(np.nan, '')

print(pandasFrame.shape)

# True only when no cell in any column is null. The previous
# len(set(...)) == 1 test was also true when every column had nulls.
if not pandasFrame.isnull().any().any():
    print('Data is Not Empty')

# Labels: factorizedLabels[0] holds integer codes, factorizedLabels[1] the flair names.
Y = pandasFrame['link_flair_text']
factorizedLabels = pd.factorize(Y)

targets = list(set(Y))
pandasFrame = pandasFrame.drop(columns="link_flair_text")

<h2> Find Correlation between various features </h2>

In [None]:
# Fit one tf-idf vectorizer per text column, then report the Pearson
# correlation of each remaining (non-text) column with the label codes.
tfidVectorizer = {}

X = pandasFrame
tempDataFrame = pandasFrame
tfidfFrames = []

# Text fields to tf-idf
for column in X.columns:
    if X[column].dtype == 'object':
        vectorizer = TfidfVectorizer().fit(tempDataFrame[column])
        tfidVectorizer[column] = vectorizer
        # get_feature_names() was removed in scikit-learn 1.2; use its
        # replacement when available and fall back on older releases.
        if hasattr(vectorizer, 'get_feature_names_out'):
            featureNames = vectorizer.get_feature_names_out()
        else:
            featureNames = vectorizer.get_feature_names()
        tfidfFrames.append(pd.DataFrame(
            vectorizer.transform(tempDataFrame[column]).toarray(),
            columns=featureNames))
        tempDataFrame = tempDataFrame.drop(columns=column)

# Concatenate once instead of growing the frame inside the loop.
# Note: newTest is not an input to the correlation below.
newTest = pd.concat(tfidfFrames, axis=1) if tfidfFrames else pd.DataFrame()

tempDataFrame = pd.concat([tempDataFrame, pd.DataFrame({'labels': factorizedLabels[0]})], axis=1)
tempDataFrame.corr(method='pearson')['labels']

<h2> Remove Highly Correlated and Redundant Features </h2>

In [None]:
# Columns removed from the model input in this step.
droppedColumns = [
    "author_fullname",
    "author_flair_text",
    "is_crosspostable",
    "is_meta",
    "is_original_content",
    "is_robot_indexable",
    "is_self",
    "is_video",
    "media_only",
    "permalink",
    "total_awards_received",
    "post_hint",
]
pandasFrame = pandasFrame.drop(columns=droppedColumns)

<h2>Divide Data into training and test data</h2>

In [None]:
# Split into train/test sets, then replace every text column with tf-idf
# features. Vectorizers are fitted on the training rows only and kept in
# tfidVectorizer (column name -> fitted vectorizer).
tfidVectorizer = {}

X = pandasFrame
X_train, X_test, y_train, y_test = train_test_split(X, factorizedLabels[0], test_size=0.3, random_state=42)

# A list rather than a set: newer scikit-learn releases reject a set for stop_words.
stop_words = sorted(set(stopwords.words("english")))

trainFrames = []
testFrames = []

# Text fields to tf-idf
for column in X.columns:
    if X[column].dtype == 'object':
        print(column)
        vectorizer = TfidfVectorizer(max_df=0.85, stop_words=stop_words, max_features=1500, smooth_idf=True, use_idf=True)
        vectorizer.fit(X_train[column])
        tfidVectorizer[column] = vectorizer
        # get_feature_names() was removed in scikit-learn 1.2; use its
        # replacement when available and fall back on older releases.
        if hasattr(vectorizer, 'get_feature_names_out'):
            featureNames = vectorizer.get_feature_names_out()
        else:
            featureNames = vectorizer.get_feature_names()
        trainFrames.append(pd.DataFrame(vectorizer.transform(X_train[column]).toarray(), columns=featureNames))
        testFrames.append(pd.DataFrame(vectorizer.transform(X_test[column]).toarray(), columns=featureNames))
        X_train = X_train.drop(columns=column)
        X_test = X_test.drop(columns=column)

# Concatenate once instead of growing the frames inside the loop.
newDFTrain = pd.concat(trainFrames, axis=1) if trainFrames else pd.DataFrame()
newDFTest = pd.concat(testFrames, axis=1) if testFrames else pd.DataFrame()

# The split keeps the original row labels while the tf-idf frames are
# numbered from 0, so renumber before joining side by side.
X_train = pd.concat([X_train.reset_index(drop=True), newDFTrain], axis=1)
X_test = pd.concat([X_test.reset_index(drop=True), newDFTest], axis=1)

<h2>Selecting best Classifiers after scaling data to common scale</h2>

In [None]:
# Compare candidate classifiers with 10-fold cross-validation on the
# training set. Every candidate is preceded by a StandardScaler step.
candidates = [
    ('SGDClassifier', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42, max_iter=5, tol=None)),
    ('LogisticRegression', LogisticRegression()),
    ('KNeighborsClassifier', KNeighborsClassifier()),
    ('RandomForestClassifier', RandomForestClassifier()),
    ('MLPClassifier', MLPClassifier()),
    # The targets are nominal flair codes, so a classifier is used here
    # rather than a regression tree.
    ('DecisionTreeClassifier', DecisionTreeClassifier()),
]

pipelines = [
    ('Scaled' + label, Pipeline([('Scaler', StandardScaler()), (label, estimator)]))
    for label, estimator in candidates
]

# random_state only takes effect together with shuffle=True; recent
# scikit-learn releases raise when it is given without shuffling.
kfold = KFold(n_splits=10, shuffle=True, random_state=21)

results = []
names = []
for name, model in pipelines:
    # Accuracy instead of negative MSE: distances between label codes are meaningless.
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

<h2>Generating Training Models and finding the most accurate model</h2>

In [None]:
# Every trained model is recorded here as [test accuracy, fitted classifier].
models = []

def _fitAndRecord(label, classifier, X_train, X_test, y_train, y_test):
    """Fit `classifier`, print its test accuracy, and append it to `models`.

    `label` is the human-readable model name used in the printed line.
    The entry appended to the module-level `models` list is
    [accuracy, classifier].
    """
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    # accuracy_score expects (y_true, y_pred).
    accuracy = accuracy_score(y_test, y_pred)
    print(label + ' Accuracy: ', accuracy)
    models.append([accuracy, classifier])

def bayesClassifier(X_train,X_test,y_train,y_test):
    """Train and record a multinomial naive Bayes classifier."""
    _fitAndRecord('Naive Bayes', MultinomialNB(), X_train, X_test, y_train, y_test)

def randomForestClassifier(X_train,X_test,y_train,y_test):
    """Train and record a random forest classifier."""
    _fitAndRecord('Random Forest', RandomForestClassifier(), X_train, X_test, y_train, y_test)

def kNeighborsClassifier(X_train,X_test,y_train,y_test):
    """Train and record a k-nearest-neighbours classifier."""
    _fitAndRecord('K Neighbors', KNeighborsClassifier(), X_train, X_test, y_train, y_test)

def stochasticGradientClassifier(X_train,X_test,y_train,y_test):
    """Train and record a hinge-loss linear classifier fitted with SGD."""
    classifier = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42, max_iter=5, tol=None)
    _fitAndRecord('Stochastic Gradient Decent', classifier, X_train, X_test, y_train, y_test)

def logisticRegressionClassifier(X_train,X_test,y_train,y_test):
    """Train and record a logistic regression classifier."""
    _fitAndRecord('Logistic Regression', LogisticRegression(), X_train, X_test, y_train, y_test)

# Train every candidate in a fixed order.
for trainModel in (bayesClassifier,
                   randomForestClassifier,
                   kNeighborsClassifier,
                   stochasticGradientClassifier,
                   logisticRegressionClassifier):
    trainModel(X_train, X_test, y_train, y_test)

<h3>Saving the best Model</h3>

In [None]:
def saveModel(models,tfidVectorizer):
    """Pickle the most accurate model together with what is needed to use it.

    Parameters
    ----------
    models : list
        Entries of the form [accuracy, fitted model]. The list is sorted
        in place by accuracy, highest first.
    tfidVectorizer : dict
        The fitted vectorizers to store next to the model.

    Writes trainedModel.pickle, vectorizer.pickle and labels.pickle to the
    working directory. The labels come from the module-level
    `factorizedLabels`.

    Raises
    ------
    IndexError
        If `models` is empty.
    """
    if not models:
        raise IndexError('saveModel requires at least one [accuracy, model] entry')
    models.sort(key=lambda x: x[0], reverse=True)
    trainedModel = models[0][1]
    artifacts = (
        ("trainedModel.pickle", trainedModel),
        ("vectorizer.pickle", tfidVectorizer),
        ("labels.pickle", factorizedLabels[1]),
    )
    for path, payload in artifacts:
        # The context manager closes each file even if pickling fails.
        with open(path, "wb") as handle:
            pickle.dump(payload, handle)

saveModel(models, tfidVectorizer)

<h3>References</h3>
<p>https://towardsdatascience.com/your-first-kaggle-competition-submission-64da366e48cb</p>
<p>https://towardsdatascience.com/multi-class-text-classification-model-comparison-and-selection-5eb066197568</p>
<p>https://www.kaggle.com/junkal/selecting-the-best-regression-model</p>
<p>https://towardsdatascience.com/natural-language-processing-on-multiple-columns-in-python-554043e05308</p>