## In this project, we'll be predicting the number of upvotes the articles received, based on their title. Because upvotes are an indicator of popularity, we'll discover which types of articles tend to be the most popular.

In [3]:
client_id = 'c6aloydyNEh81A'
secret = 'A6kOhLnbyLUI-zUU4i19pOZLTYg'

#-----------------Import Modules-----------------#
import praw
import pandas as pd
from datetime import datetime
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

#-----------------Define variables-----------------#
subreddit = 'python'
data_list = {}
rows = 10000


#-----------------connect to reddit api using Praw wrapper-----------------#
reddit = praw.Reddit(client_id= client_id, client_secret = secret, username = '',
                     password  ='PuzzledTarget', user_agent ='Incubator_project')

#-----------------extract data from the subreddit -----------------#
subreddit = reddit.subreddit(subreddit)

# extracting from the hot tab ( Time and votes are considered for hot rating)
hot_python = subreddit.hot(limit = rows)


for submission in hot_python:
    if not submission.stickied:
        if submission.id not in data_list :
            created_time = (datetime.utcfromtimestamp(submission.created_utc).strftime('%Y-%m-%d %H:%M:%S'))
            data_list[submission.id] =[submission.title, submission.ups, submission.downs, created_time, submission.upvote_ratio]
        else:
            raise ValueError('duplicate id found')


In [21]:
#-----------------Create a pandas DataFrame out of the dictionary -----------------#
columns = ['title' , 'ups' , 'downs', 'create_time' , 'upvote_ratio' ]
reddit_submission = pd.DataFrame.from_dict(data_list, orient = 'index' , columns = columns)
reddit_submission.head(5)

Unnamed: 0,title,ups,downs,create_time,upvote_ratio
cix5ek,I made a script that uses the mouse and keyboa...,1466,0,2019-07-28 15:28:37,0.98
cj143q,Robot with live feed built using pynetwork,102,0,2019-07-28 20:41:37,0.96
civysm,I'm trying to make a flappy bird replica in py...,336,0,2019-07-28 13:43:52,0.93
cj9he8,An interview covering what you need to know ab...,4,0,2019-07-29 10:33:45,1.0
cj958g,Building a PEG Parser,3,0,2019-07-29 09:56:51,1.0


In [22]:
# let explore the dataset 
reddit_submission.shape

(910, 5)

In [104]:
# there are no nulls in the dataset
print(20*'-' + 'null rows '+ 20*'-')
reddit_submission[reddit_submission.isnull().any(axis=1)]

# let is look at the  ups colums 
print(20*'-' + 'ups distribution '+ 20*'-')
reddit_submission['ups'].value_counts()
#365 elements out 910 have 0 votes

# let is look at the upvote_ratio colums  - There is atleast one upvote
print(20*'-' + 'upvote_ratio '+ 20*'-')
reddit_submission['upvote_ratio'].value_counts().sort_index().head()


# let is look at the time ups colums  - There is atleast one upvote

--------------------null rows --------------------


False

In [24]:
# covert each title into a numerical repesentation
tokenized_title = []

for item in reddit_submission['title']:
    tokenized_title.append(item.split(" "))
    
# lowercase all the items and removing punctuations
punctuation = [",", ":", ";", ".", "'", '"', "’", "?", "/", "-", "+", "&", "(", ")", "|" , ">" , "<" , "[" , "]" , "-"]

clean_tokenized = []

for item in tokenized_title:
    tokens = []
    for token in item:
        token = token.lower()
        for punc in punctuation:
            token = token.replace(punc, "")
        if token != "":
            tokens.append(token)
    clean_tokenized.append(tokens)
clean_tokenized

# find all the unique token in clean_tokenised and assign the result to unique tokenize. Any token occuring
#only 1 time will not be counted as 

unique_tokens = []
single_token =[]

for item in clean_tokenized:
    for element in item:
        if element not in  single_token:
            single_token.append(element)
        elif element in single_token and element not in unique_tokens:
            unique_tokens.append(element)

len(unique_tokens)


# initialising DataFrame to hold the numeric values for each token 
counts =  pd.DataFrame(0, index = np.arange(len(clean_tokenized)) , columns = unique_tokens)

In [25]:
for i,item in enumerate(clean_tokenized):
    for element in item:
        if element in unique_tokens:
            counts.iloc[i][element] +=1 
        else:
            continue



In [99]:
word_counts = counts.sum(axis =0)
word_counts.sort_values(ascending = False).head(15)

that       50
it         49
an         44
code       43
can        41
what       39
you        38
data       38
made       33
or         31
web        30
script     30
project    30
any        29
need       28
dtype: int64

Features or words occuring too few times will result in overfillting These feature will probably correlates differently with upvote in training set and testing set.Features or words occuring too many times will also cause issue (stopwords - such as 'and','or' etc). They do not add any information to the model. 

After having a look at the word_count distribution, to make the model better we reduce the feature by removing words that occur less than 5 times or more than 50 times


In [30]:
counts = counts.loc[:,(word_counts >=5) & (word_counts <=50)]
counts.shape

(910, 249)

Now we will split the data into 2 sets. Test and train to evaluate the algorithm effectively. we will select 20% of our rows for test and 80% of our rows for training. We will use linear regression algorithm.

In [31]:
X_train, X_test, y_train, y_test = train_test_split(counts, reddit_submission["ups"], test_size=0.2, random_state=1)

lr = LinearRegression()
lr.fit(X_train, y_train)
predictions = lr.predict(X_test)

lets us calculate the MSE (mean square error associated with our predictions)

In [32]:
mse = ((predictions - y_test)**2).sum()/len(predictions)
mse**(1/2)

202.81453211430687

In [106]:
reddit_submission["ups"].describe()[1]

33.785714285714285

For the dataset size of 986 rows - 
the mean for our up_votes is 33 and the std deviation is 170. if we take the square root of out mse, we get 182.5.This means that the average error is 182  upvotes away from the true value. This is highter than the standard deviation, so out predictions are far off the base.
Let us go ahead and increase the size of the data used



In [97]:
from sklearn.linear_model import Ridge
import numpy
import random

X_train, X_test, y_train, y_test = train_test_split(counts, reddit_submission["ups"], test_size=0.2, random_state=1)



train_rows = int(counts.shape[0]* .8)
# Set a seed to get the same "random" shuffle every time.
random.seed(1)

# Shuffle the indices for the matrix.
indices = list(range(counts.shape[0]))
random.shuffle(indices)


# Create train and test sets.
X_train_ridge = counts.loc[indices[:train_rows], :]
X_test_ridge = counts.loc[indices[train_rows:], :]
y_train_ridge = reddit_submission["ups"].iloc[indices[:train_rows]]
y_test_ridge = reddit_submission["ups"].iloc[indices[train_rows:]]
X_train_ridge = numpy.nan_to_num(X_train_ridge)

# Run the regression and generate predictions for the test set.
reg = Ridge(alpha=.1)
reg.fit(X_train_ridge, y_train_ridge)
predictions_ridge = reg.predict(X_test_ridge)

In [98]:
mse = ((predictions - test_upvotes)**2).sum()/len(test_upvotes)
mse**(1/2)

196.16180979416899

In [100]:
reddit_submission["ups"].describe()[2]

174.72759000451327

In [139]:
predictions = pd.DataFrame(data = predictions , index = test.index )
reddit_submission.index = counts.index
reddit_submission.loc[test.index,:]
reddit_predictions = pd.merge(reddit_submission, predictions, left_index = True, right_index = True)
reddit_predictions = reddit_predictions.reindex()
reddit_predictions['predicted_ups'] = reddit_predictions[0] 

In [111]:
# top 5 posts that have maximum predicted up_votes
# top 5 posts that have maximum predicted up_votes
top = {}
top['python'] = np.array[reddit_predictions.sort_values('predicted_ups' , ascending = False)['title'].head()]

TypeError: 'builtin_function_or_method' object is not subscriptable

In [140]:
reddit_predictions.sort_values('predicted_ups' , ascending = False)['title'].head()

563    Greetings Python enthusiasts of reddit, I seek...
906    I made a program which encrypts the RGB value ...
712    I wrote a tiny Python API that notifies you if...
662    I made a small Python program to automatically...
824    I made a toy renderer to work on shaders. PyOp...
Name: title, dtype: object

In [152]:
top = reddit_predictions.sort_values('predicted_ups' , ascending = False)[['title', 'predicted_ups']].head()

top = top.reindex()
top

Unnamed: 0,title,predicted_ups
563,"Greetings Python enthusiasts of reddit, I seek...",814.157433
906,I made a program which encrypts the RGB value ...,804.525155
712,I wrote a tiny Python API that notifies you if...,532.764095
662,I made a small Python program to automatically...,512.430284
824,I made a toy renderer to work on shaders. PyOp...,486.349808


In [136]:
reddit_predictions[['title', 'predicted_ups']].head()

TypeError: 'method' object is not subscriptable

In [138]:
reddit_predictions = reddit_predictions.reindex()

AttributeError: 'function' object has no attribute 'reindex'