# Reddit Posts NLP

Context
The dataset contains the Reddit posts of Indian region.

Content
This data was taken out from Reddit with the help of their easy to use api.
It contains various features such as post's title, url, description, flair etc. It contains approx. 220 posts for each of the following flair:

AskIndia
Non-Political
Scheduled
Photography
Science/Technology
Politics
Business/Finance
Policy/Economy
Sports
Food
AMA
Inspiration
The main target is to develop a prediction model accurate enough for predicting the flair of a Reddit post.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [216]:
pd.set_option('display.max_columns', 15)

# Functions

In [11]:
def num_unique_values_in_cols(df,cols = None):
    if cols != None:
        if type(cols) == str:
            cols = [cols]
        d_unique = {}
        for col in cols:
            d_unique[col] = df[col].value_counts().count()
    return d_unique
            

# Common Fuctions Across Algorithms

In [67]:
def convert_cat(df, cat_cols,output_col=None):
    """
    This function converts all categorical columns into numerical boolean columns.
    
    There are 3 parameters: df, cat_cols, and output_var. 
    
    1. df is the dataframe which needs to have categorical variables converted to numerical variables
    2. cat_cols needs to be a list that contains the names of all categorical columns that need to be converted.
    3. output_var is the name of the output or response variable.  It is set to 'Output' as default."""
    
    df.reset_index(inplace = True, drop = True)
    df_out = df.copy()
    
    for col in cat_cols:
        dummy_col = pd.get_dummies(df_out[col],drop_first = True)
        df_out.drop(col,axis=1, inplace = True)
        df_out = df_out.join(dummy_col)
    
    if output_col != None:
        loc_df_out = list(df_out.columns).index(output_col)
        df_out = df_out[list(df_out.columns[:loc_df_out]) + list(df_out.columns[loc_df_out + 1:]) + list(df_out.columns[loc_df_out:loc_df_out + 1])]
    return df_out

In [3]:
def normalize(df,cat_class = None):
    """
    This function takes in a dataframe as a parameter and returns the same dataframe with all the features normalized between 0 and 1 using rescaling (min-max normalization)
    """
    l_min = []
    l_max = []
    if cat_class == None:
        for col in df.columns:
            l_min.append(df.describe()[col]['min'])
            l_max.append(df.describe()[col]['max'])
        
        t_min = list(zip(df.columns, l_min))
        t_max = list(zip(df.columns, l_max))
 

    else:
        for col in df.drop(cat_class,axis = 1).columns:
            l_min.append(df.describe()[col]['min'])
            l_max.append(df.describe()[col]['max'])

        t_min = list(zip(df.drop(cat_class,axis = 1).columns, l_min))
        t_max = list(zip(df.drop(cat_class,axis = 1).columns, l_max))
    
   
    d_min = {}
    for col,val in t_min:
        d_min[col]=val
    
    d_max = {}
    for col,val in t_max:
        d_max[col]=val
    
    df_copy = df.copy()
    for key in d_min.keys():
        df_copy[key] = df_copy[key].apply(lambda x: (x - d_min[key])/ (d_max[key] - d_min[key]))
    
    return df_copy

In [4]:
def split_df(df,test_size = 0.3,df_to_return = 'df_train'):
    """
    This function takes in a Pandas DataFrame and returns a 
    dataframe that is a subset of that Pandas DataFrame.
    
    There are 3 parameters: df, test_size, and df_to_return
    
    df needs to be a Pandas DataFrame and is the superset dataframe to be divided.
    test_size is the proportion of the dataframe you want to be the testing dataset.
    test_size is set to 0.3 by default.
    df_to_return needs to specified as either 'df_train' or df_test' 
    to return the correct subset dataframe. df_to_return is set to 'df_train' by default
    """
    split_num = int(df.count()[0] * (1-test_size) //1)
    df_train = df.iloc[:split_num,:]
    df_test = df.iloc[split_num:,:]
    if df_to_return in ['df_train','train']:
        return df_train
    elif df_to_return in ['df_test','test']:
        return df_test

In [218]:
def check_accuracy(df = None,pred_df = None, test_df = None, algo = 'lin',target_class = None):
    """
    This function takes in a pandas DataFrame and returns the accuracy of the model
    
    There are 2 parameters: df and algo
    
    1. df needs to be a Pandas DataFrame and algo is the algorithm used.
    2. algo is set to 'lin' by default but can also be specified as 'log'"""
    
    if algo == 'lin':
        df_out = df.copy()
        df_out['error'] = df.iloc[:,-2] - df.iloc[:,-1]
        RMSE = (sum(df_out['error']**2)/df_out.count()[0]+1) ** 0.5
        acc = (np.mean(df_out.iloc[:,-3]) - RMSE) / np.mean(df_out.iloc[:,-3])
        return {'acc':acc,'RMSE':RMSE}
    
    elif algo == 'log':
        return sum(df['Correct?']/df.count()[0])
    
    elif algo == 'knn':
        pred = pred_df[target_class]
        test = test_df[target_class]
        return sum(pred == test) / len(pred)

# Linear Regression and Logistic Regression

In [404]:
def stoch_grad_desc(dataset,output_col,cols_to_ignore = None,alpha = 0.3,epoch = 10,algo = 'lin',initial_coeffs = 1):
    """
    This function returns a list of the coefficients for the specified algorithm.  
    Currently, this function only performs Linear and Logistic Regression.
    
    The 4 parameters are: dataset, alpha, epoch, and algo
    
    1. dataset needs to be a pandas DataFrame
    2. alpha is the alpha value used in stochiastic gradient descent.  It is set at 0.3 by default.
    3. epoch is the number of iterations through each row in the dataset algorithm will perform.  epoch is set to 10 by default.
    4. algo is the specific algorithm to be used.  algo is 'lin' by default for Linear Regression but can also be specified as 'log' for Logistic Regression
    """

    loc_output_col = list(dataset.columns).index(output_col)
    dataset = dataset[list(dataset.columns[:loc_output_col]) + list(dataset.columns[loc_output_col+1:]) + list(dataset.columns[loc_output_col:loc_output_col + 1])]
    
    from math import exp
    count_rows = dataset.count()[1]
    
    if cols_to_ignore != None:
        dataset_1 = pd.DataFrame(pd.Series(np.ones(dataset.count()[0])),columns = ['X0']).join(dataset.drop(cols_to_ignore,axis = 1))
    else:
        dataset_1 = pd.DataFrame(pd.Series(np.ones(dataset.count()[0])),columns = ['X0']).join(dataset)
        
    coeffs = list(np.ones(len(dataset_1.columns[0:-1])))
    coeffs = [i * initial_coeffs for i in coeffs]
    
    df_columns = list(dataset_1.columns)
    df_input_cols = df_columns
    df_input_cols.remove(output_col)
    
    for ep in range(epoch):
        for row in range(count_rows):
            y = dataset_1.loc[row,output_col]
            output_terms = []
            #return y
            for col in df_input_cols:
                output_terms.append((coeffs[dataset_1.columns.get_loc(col)],dataset_1.loc[row,col]))
            
            output_list = [(x*y) for (x,y) in output_terms]
            
            output = sum(output_list)
            
            if algo == 'lin':
                pred = output
                for i in range(len(coeffs)):
                    coeffs[int(i)] += alpha * (dataset_1.loc[row,output_col] - pred) * dataset_1.iloc[row,int(i)]
                
            elif algo == 'log':
                pred = 1 / (1 + exp(-output))
                
                for i in range(len(coeffs)):
                    coeffs[i] = coeffs[i] + alpha * (y - pred) * pred * (1 - pred) * dataset_1.iloc[row,i]
                    
    
    
    return coeffs

In [462]:
def make_predictions(dataset, coeff_list, output_col,cols_to_ignore = None,algo = 'lin'):
    """
    This function takes in a pandas DataFrame and a list that contains 
    coefficients for the specified algorith used in the stoch_grad_desc function
    and returns the same dataset (with the addition of the first column being 1s to
    represent X0 in the regression formula) plus a new column at the end, 'Prediction'.
    
    There are 3 parameters: dataset, coeff_list, and algo
    
    1. dataset is the dataframe to used to make predictions dataset needs to be a Pandas DataFrame
    2. coeff_list should be the list that was the result of running the stoch_grad_desc function
    3. algo is the specific algorithm to be used.  algo is set to 'lin' by default but can be set to 'log'"""
    
    loc_output_col = list(dataset.columns).index(output_col)
    dataset = dataset[list(dataset.columns[:loc_output_col]) + list(dataset.columns[loc_output_col+1:]) + list(dataset.columns[loc_output_col:loc_output_col + 1])]
    dataset_index = dataset.index
    
    if type(cols_to_ignore) == list and cols_to_ignore != None:
        df_ignored_cols = dataset.loc[:,cols_to_ignore]
    elif type(cols_to_ignore) != list and cols_to_ignore != None:
        df_ignored_cols = dataset.loc[:,[cols_to_ignore]]
    if cols_to_ignore != None:
        df_ignored_cols.reset_index(inplace = True)
    
    from math import exp
    
    dataset.reset_index(inplace = True, drop = True)
    
    if cols_to_ignore != None:
        dataset_out = pd.DataFrame(pd.Series(np.ones(dataset.count()[0]))).join(dataset.drop(cols_to_ignore,axis = 1))
    else:
        dataset_out = pd.DataFrame(pd.Series(np.ones(dataset.count()[0]))).join(dataset)

    
    dataset_out.rename(mapper = {0:'X0'},axis = 1, inplace = True)
    
    coeffs = coeff_list
    pred = []
    
    for row in range(dataset_out.count()[0]):
        output_terms = []
        for col in dataset_out.columns[0:-1]:
            output_terms.append((coeffs[dataset_out.columns.get_loc(col)],dataset_out.loc[row,col]))
        output_list = [x*y for (x,y) in output_terms]
        
        output = sum(output_list)
        if algo == 'lin':
            pred.append(output)
        
        elif algo == 'log':
            pred.append(1/(1 + exp(-output)))
    dataset_out = dataset_out.join(pd.DataFrame(pred))
    dataset_out.rename(mapper = {0: 'Prediction'},axis = 1, inplace = True)
    
    if algo == 'log':
        dataset_out['Crisp'] = dataset_out['Prediction'].apply(lambda predi: 1 if predi >= 0.5 else 0)
        dataset_out['Correct?'] = dataset_out.iloc[:,-3] == dataset_out['Crisp']
    
    if cols_to_ignore != None:
        dataset_out = pd.concat([df_ignored_cols,dataset_out],axis = 1)
    dataset_out.drop('X0',axis = 1,inplace = True)
    
    dataset_out.set_index(dataset_index,inplace = True)
    return dataset_out

# K Nearest Neighbors

In [185]:
def knn_1pt(df,new_point,output_col,k = 3):
    import statistics
    df1 = df.copy()
    new_pt_df = pd.DataFrame(data = [new_point + ['DK']],columns=df.columns)
    df_out = df1.append(new_pt_df,ignore_index = True)
    
    df2 = df_out.drop(output_col,axis=1)
    count_rows = df2.count()[0]
    new_pt_ind = count_rows - 1
    df2['sum_sqrd_diffs'] = 0
    for row_num in range(0,count_rows):
        sum_sqrd_diffs = 0
        for col_num in range(0,len(df2.columns)):
            sum_sqrd_diffs += (df2.iloc[new_pt_ind,col_num] - df2.iloc[row_num,col_num])**2
        df_out.loc[row_num,'sum_sqrd_diffs'] = sum_sqrd_diffs 
       
    df_out.loc[new_pt_ind,output_col] = statistics.mode(df_out.iloc[:new_pt_ind,:].sort_values('sum_sqrd_diffs').head(k)[output_col])
    #return df_out.sort_values('sum_sqrd_diffs').head(10)
    return df_out
              
    

In [212]:
def knn(df_in,new_pts_list,dep_col,k = 7):
    import statistics
    
    if type(new_pts_list) == type(df_in):
        l_list_new_pts = []
        for rn in range(new_pts_list.count()[0]):
            l_list_new_pts.append(list(new_pts_list.iloc[rn,:]))
        new_pts_list = l_list_new_pts

    def knn_1pt(df,new_point,output_col = dep_col,k = k):
        
        df1 = df.copy()
        new_pt_df = pd.DataFrame(data = [new_point + ['DK']],columns=df.columns)
        df_out = df1.append(new_pt_df,ignore_index = True)

        df2 = df_out.drop(output_col,axis=1)
        count_rows = df2.count()[0]
        new_pt_ind = count_rows - 1
        df2['sum_sqrd_diffs'] = 0
        for row_num in range(0,count_rows):
            sum_sqrd_diffs = 0
            for col_num in range(0,len(df2.columns)):
                sum_sqrd_diffs += (df2.iloc[new_pt_ind,col_num] - df2.iloc[row_num,col_num])**2
            df_out.loc[row_num,'sum_sqrd_diffs'] = sum_sqrd_diffs 
            
        
        try:
            mode = statistics.mode(df_out.sort_values('sum_sqrd_diffs')[0:k][output_col])
        except:
            closest = list(df_out.sort_values('sum_sqrd_diffs')[0:k][output_col].values)
            copy = closest[:]
            closest.sort(key = lambda x:copy.count(x))
            closest.reverse()
            mode = closest[0]
        df_out.loc[new_pt_ind,output_col] = mode
        df_in = df_out.drop('sum_sqrd_diffs',axis = 1).copy()
        return df_in
    
        """
        df_out.loc[new_pt_ind,output_col] = statistics.mode(df_out.iloc[:new_pt_ind,:].sort_values('sum_sqrd_diffs').head(k)[output_col])
        df_in = df_out.drop('sum_sqrd_diffs',axis = 1).copy()
        return df_in
        """
    
    for np in new_pts_list:
        df_in =  knn_1pt(new_point = np,df=df_in).copy()
    return df_in

In [28]:
import statistics

In [None]:
try:
    mode = statistics.mode(df.sort_values('sum_sqrd_diffs')[0:k][output_col])
except:
    closest = list(df.sort_values(sum_sqrd_diffs)[0:k][output_col].values)
    copy = closest[:]
    closest.sort(key = lambda x:copy.count(x))
    closest.reverse()
    mode = closest[0]

In [204]:
df = pd.DataFrame({'col1':[1,2,3,4,2,1,4],'col2':[2,5,7,3,1,4,20]})

In [205]:
df.reset_index(inplace = True)

In [209]:
statistics.mode(df.sort_values('col2')[0:4]['col1'])

1

In [77]:
l = list(df.sort_values('col2')[0:5]['col1'].values)
l
B = l[:]
l

[2, 1, 4, 5, 2]

In [84]:
l.sort(key=lambda x:B.count(x))
l.reverse()

In [80]:
l

[2, 2, 5, 4, 1]

In [94]:
try:
    mode = statistics.mode(df.sort_values('col2')[0:5]['col1'])
except:
    closest = list(df.sort_values('col2')[0:5]['col1'].values)
    copy = closest[:]
    closest.sort(key = lambda x:copy.count(x))
    closest.reverse()
    mode = closest[0]
    

closest,mode
    

([2, 1, 1, 2, 4], 2)

# KMeans

In [70]:
def kmeans(df,k=3,epoch = 1):
    import random
    
    def euclid_squared_distance(pt1,pt2):
        return (pt1 - pt2)**2
    
    def select_k_points(dataframe = df,k = k):
        centroid_dataframe = pd.DataFrame(columns = dataframe.columns)
        centroid_indices = []
        for i in range(k):
            while True:
                rand_ind = random.choice(list(dataframe.index))
                if rand_ind not in centroid_indices:
                    centroid_indices.append(rand_ind)
                    break
            centroid_dataframe = centroid_dataframe.append(pd.DataFrame(data = [list(dataframe.loc[rand_ind,:])],columns = dataframe.columns,index = [rand_ind]))
            for row_num,ind in enumerate(list(centroid_dataframe.index)):
                centroid_dataframe.loc[ind,'Cluster'] = int(row_num)
        return centroid_dataframe
    
#     def kmeans_pp(dataframe = df,k = k):
#         centroid_dataframe = pd.DataFrame(columns = dataframe.columns)
#         centroid_indices = []
#         rand_ind = random.choice(list(dataframe.index))
#         centroid_indices.append(rand_ind)
#         centroid_dataframe.loc[rand_ind,:] = dataframe.iloc[rand_ind,:]
        
#         for i in range(k-1):
#             for row in dataframe.index:
#                 sqd_diffs_df = pd.DataFrame(columns = dataframe.columns)
#                 for cent_ind in centroid_indices:                
#                     sqd_diffs_df.loc[cent_ind,:] = (centroid_dataframe.loc[cent_ind,:] - dataframe.loc[row,:]) ** 2
#                     sqd_diffs_df['Sum_Squared_Diffs'] = sqd_diffs_df.sum(axis = 1)
#                 min_sqd_diffs = sqd_diffs_df['Sum_Squared_Diffs'].min()
#                 sqd_diffs_df=sqd_diffs_df[sqd_diffs_df['Sum_Squared_Diffs'] = 8]

            
    df['Cluster'] = np.nan
    
    centroid_df = select_k_points()

    centroid_indices = list(centroid_df.index)
    
    

    def one_iteration_k_means(dataframe = df,centroid_dataframe = centroid_df,cluster_col = 'Cluster'):
        for row in dataframe.index:
            sqd_diffs_df = pd.DataFrame(columns = centroid_dataframe.drop(cluster_col,axis = 1).columns)
            for cent_ind in centroid_indices:
                sqd_diffs_df.loc[cent_ind,:] = (centroid_dataframe.drop(cluster_col,axis = 1).loc[cent_ind,:] - dataframe.drop(cluster_col,axis = 1).loc[row,:]) ** 2 
            sqd_diffs_df['Sum_Sqd_Diffs'] = sqd_diffs_df.sum(axis = 1)
            for row_num,ind in enumerate(list(sqd_diffs_df.index)):
                sqd_diffs_df.loc[ind,'Cluster'] = int(row_num)

            smallest_ssd = sqd_diffs_df['Sum_Sqd_Diffs'].min()

            smallest_ssd_df = sqd_diffs_df[sqd_diffs_df['Sum_Sqd_Diffs'] == smallest_ssd]

            closest_centroid = pd.DataFrame(data = [smallest_ssd_df.loc[smallest_ssd_df.index[0],:].values],columns = list(sqd_diffs_df.columns),index = smallest_ssd_df.index)

            closest_centroid_ind = list(closest_centroid.index)[0]

            df.loc[row,cluster_col] = centroid_dataframe.loc[closest_centroid_ind,cluster_col]

            for row_cent in list(centroid_dataframe.index):
                
                if row_cent == closest_centroid_ind:
                    for col in list(centroid_dataframe.drop(cluster_col,axis = 1).columns):
                        centroid_dataframe.loc[row_cent,col] = (centroid_dataframe.loc[row_cent,col] + df.loc[row,col])/2
                        
        return centroid_dataframe
    
    for iteration in range(epoch):
        centroid_df = one_iteration_k_means() 

    return df

In [9]:
reddit = pd.read_csv(r'reddit_data.csv')

In [110]:
reddit = reddit.sample(frac=1).reset_index(drop=True)

In [111]:
reddit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2510 entries, 0 to 2509
Data columns (total 9 columns):
title           2510 non-null object
score           2510 non-null int64
id              2510 non-null object
subreddit       2510 non-null object
url             2510 non-null object
num_comments    2510 non-null int64
body            1563 non-null object
created         2510 non-null float64
flair           2510 non-null object
dtypes: float64(1), int64(2), object(6)
memory usage: 176.6+ KB


In [5]:
reddit.describe()

Unnamed: 0.1,Unnamed: 0,score,num_comments,created
count,2510.0,2510.0,2510.0,2510.0
mean,1254.5,147.20239,60.339044,1535346000.0
std,724.718911,643.183307,353.928483,56542800.0
min,0.0,0.0,0.0,1253877000.0
25%,627.25,9.0,4.0,1500834000.0
50%,1254.5,30.0,13.0,1558507000.0
75%,1881.75,110.0,45.0,1580475000.0
max,2509.0,17900.0,10799.0,1587680000.0


In [113]:
num_unnamed = list(reddit['Unnamed: 0'].value_counts().index)

In [7]:
num_unnamed.sort()

In [8]:
num_unnamed[-1]

2509

In [10]:
reddit.drop('Unnamed: 0',axis = 1,inplace = True)

In [114]:
reddit.head(10)

Unnamed: 0,title,score,id,subreddit,url,num_comments,body,created,flair
0,"Weekly photography thread thread - January 06,...",3,eko55x,india,https://www.reddit.com/r/india/comments/eko55x...,3,This is a weekly scheduled thread for all the ...,1578310000.0,Scheduled
1,Advice for an Aussie promoting their business ...,24,em8hu3,india,https://www.reddit.com/r/india/comments/em8hu3...,12,"Hi all,\n\n&#x200B;\n\nI hope you don't mind m...",1578599000.0,Business/Finance
2,"abcd here, anyone wanna be friends lol? ama",2,b6unig,india,https://www.reddit.com/r/india/comments/b6unig...,19,actually cbcd (canadian but came to us @ like ...,1553877000.0,AMA
3,Coronavirus (COVID-19) Megathread - News and U...,417,fqqdsg,india,https://www.reddit.com/r/india/comments/fqqdsg...,10799,###[Covid-19 Fundraisers & Donation Links](htt...,1585451000.0,AskIndia
4,Why Narendra Modi has taken the big and the ba...,41,exa23n,india,https://theprint.in/opinion/why-narendra-modi-...,2,,1580609000.0,Politics
5,"AMA Announcement: Swati Bhargava, Co-founder o...",5,6qv9k6,india,https://www.reddit.com/r/india/comments/6qv9k6...,121,Hello!\n\nA little while back some of you had ...,1501611000.0,AMA
6,Punjab sends 20 trains of wheat and rice to ot...,408,fpcuzw,india,https://www.nationalheraldindia.com/india/punj...,13,,1585263000.0,Food
7,[Photography] Durga Pooja celebration aftermat...,250,9p62k4,india,https://imgur.com/KJQjz2J,32,,1539867000.0,Photography
8,Caste Of A Person Cannot Be Changed By Virtue ...,26,djv5iw,india,https://www.livelaw.in/news-updates/caste-of-a...,6,,1571468000.0,Scheduled
9,People who test negative might also have coron...,44,fywlyr,india,https://www.reddit.com/r/india/comments/fywlyr...,3,>scientists have warned about a growing concer...,1586609000.0,Science/Technology


In [115]:
reddit['flair'].value_counts()

Politics              247
Food                  242
Scheduled             234
Business/Finance      233
AskIndia              232
Sports                231
Photography           222
Science/Technology    221
Policy/Economy        220
Non-Political         216
AMA                   212
Name: flair, dtype: int64

# NLTK

In [11]:
import nltk

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [25]:
string = reddit.loc[0,'body']

In [24]:
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [62]:
tokens = word_tokenize(string)

In [63]:
tokens

['xposted',
 'from',
 '/r/askmen',
 ',',
 'posted',
 'here',
 'because',
 'India',
 'is',
 'socially',
 'and',
 'culturally',
 'different',
 'and',
 'more',
 'importantly',
 'will',
 'get',
 'more',
 'relevant',
 'answers',
 '.']

In [64]:
clean_tokens = []
for token in tokens:
    if token not in stopwords.words('english'):
        clean_tokens.append(token)

In [73]:
clean_tokens

['xposted',
 '/r/askmen',
 ',',
 'posted',
 'India',
 'socially',
 'culturally',
 'different',
 'importantly',
 'get',
 'relevant',
 'answers',
 '.']

In [80]:
clean_tokens == clean_tokens_2

False

### Manually attaining clean token list

Manually attaining clean token list is not as accurate as it will include commas and other punctuation with the previous word

In [82]:
clean_tokens_2 = []
for i in string.split(' '):
    if i not in stopwords.words('english'):
        clean_tokens_2.append(i)

In [83]:
clean_tokens_2

['xposted',
 '/r/askmen,',
 'posted',
 'India',
 'socially',
 'culturally',
 'different',
 'importantly',
 'get',
 'relevant',
 'answers.']

2nd way to manually attain clean tokens

In [57]:
def bool_not_stop_word(char):
    from nltk.corpus import stopwords
    if char in stopwords.words('english'):
        return False
    else:
        return True

In [86]:
list(filter(bool_not_stop_word,string.split(' ')))

['xposted',
 '/r/askmen,',
 'posted',
 'India',
 'socially',
 'culturally',
 'different',
 'importantly',
 'get',
 'relevant',
 'answers.']

In [85]:
list(filter(bool_not_stop_word,string.split(' '))) == clean_tokens_2

True

In [90]:
reddit[reddit['body'] == np.nan]

Unnamed: 0,title,score,id,subreddit,url,num_comments,body,created,flair


In [171]:
d = {}

In [172]:
d[0] == 1 

KeyError: 0

In [179]:
reddit.head(2)

Unnamed: 0,title,score,id,subreddit,url,num_comments,body,created,flair
0,"Attractive men, what type of attention do you ...",20,cprtaa,india,https://www.reddit.com/r/india/comments/cprtaa...,21,"xposted from /r/askmen, posted here because In...",1565725000.0,AskIndia
1,[askindia] My brothers bank account was hacked...,42,dacmel,india,https://www.reddit.com/r/india/comments/dacmel...,64,My brother received bunch of SMS this morning ...,1569682000.0,AskIndia


In [266]:
def create_token_cols(df,text_col):
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize
    from nltk.corpus import wordnet as wn
    from nltk.stem import WordNetLemmatizer
    from time import time
    df_cols_orig = list(df.columns)
    lemmatizer = WordNetLemmatizer()
    
    stops = stopwords.words('english')
    
    def lower_text(df = df,text_col = text_col):
        df_u = df.copy()
        df_u[text_col] = df_u[text_col].apply(lambda string: string.lower() if type(string) == str else string)
        return df_u
    
    df = lower_text()
    
    def create_tok_dict(df = df,text_col = text_col):
        start_time = time()
        tok_dict = {}
        for i in df.index:
            text = df.loc[i,text_col]
            if type(text) == str:
                tokens = word_tokenize(text)
                clean_tokens = []
                for token in tokens:
                    if token not in stops:
                        clean_tokens.append(token)
                        
                lemmatized_tokens = []        
                d_tokens_pos = {}
#                 return d_tokens_pos
                for token in clean_tokens:
                    try:
                        tmp = wn.synsets(token)[0].pos()
                        d_tokens_pos[token] = tmp
                    except:
                        d_tokens_pos[token] = 'none'
                        lemmatized_tokens.append(token)   
                for tok in clean_tokens:
                    if d_tokens_pos[tok] == 'n':
                        lemmatized_tokens.append(lemmatizer.lemmatize(tok,pos = 'n'))
                    elif d_tokens_pos[tok] == 'v':
                        lemmatized_tokens.append(lemmatizer.lemmatize(tok,pos = 'v'))
                    elif d_tokens_pos[tok] == 'a':
                        lemmatized_tokens.append(lemmatizer.lemmatize(tok,pos = 'a'))
                    elif d_tokens_pos[tok] == 'r':
                        lemmatized_tokens.append(lemmatizer.lemmatize(tok,pos = 'r')) 
                    else:
                        pass

                
                tok_dict[i] = lemmatized_tokens
        end_time = time()
        print("Creating the token dictionary took " + str(end_time - start_time) + " seconds.")
        return tok_dict
                
    tok_dict = create_tok_dict()
    
    def create_new_cols(df= df,text_col = text_col,tok_dict = tok_dict,df_cols_orig = df_cols_orig):
        
        df_out = df.copy()
        
        d_freqs = {}
        all_words = []
        
        start_time = time()
        
        for ind in list(tok_dict.keys()):  
            d_freqs[ind] = nltk.FreqDist(tok_dict[ind])
            for word in tok_dict[ind]:
                if word not in df_cols_orig:
                    df_out.loc[ind,word] = d_freqs[ind][word]
                else:
                    df_out.loc[ind,'text' + str(word)] = d_freqs[ind][word]
        
        end_time = time()
        print("Creating New Columns took " + str(end_time - start_time) + " seconds.")
        return df_out
    
    final_df = create_new_cols()
    return final_df
        
                
        

In [45]:
def create_new_cols(df= reddit,text_col = 'body',tok_dict = final_df):
    from time import time
    df_out = df.copy()
    
    all_words = []

    start_time = time()

    #for ind in list(tok_dict.keys()):
    for ind in range(0,10):

        d_freqs = nltk.FreqDist(tok_dict[ind])
        for word in tok_dict[ind]:
            df_out.loc[ind,'t_%s' % word] = d_freqs[word]

    end_time = time()
    print(end_time - start_time)
    return df_out

In [12]:
create_new_cols()

In [436]:
final_df = create_token_cols(reddit.iloc[:180,:],'body')

Creating the token dictionary took 1.4581027030944824 seconds.
Creating New Columns took 63.08543086051941 seconds.


In [402]:
final_df

Unnamed: 0,title,score,id,subreddit,url,num_comments,body,...,automatically,cause,apparently,change,anymore,beating,therapy
0,"Weekly photography thread thread - January 06,...",3,eko55x,india,https://www.reddit.com/r/india/comments/eko55x...,3,this is a weekly scheduled thread for all the ...,...,,,,,,,
1,Advice for an Aussie promoting their business ...,24,em8hu3,india,https://www.reddit.com/r/india/comments/em8hu3...,12,"hi all,\n\n&#x200b;\n\ni hope you don't mind m...",...,,,,,,,
2,"abcd here, anyone wanna be friends lol? ama",2,b6unig,india,https://www.reddit.com/r/india/comments/b6unig...,19,actually cbcd (canadian but came to us @ like ...,...,,,,,,,
3,Coronavirus (COVID-19) Megathread - News and U...,417,fqqdsg,india,https://www.reddit.com/r/india/comments/fqqdsg...,10799,###[covid-19 fundraisers & donation links](htt...,...,,,,,,,
4,Why Narendra Modi has taken the big and the ba...,41,exa23n,india,https://theprint.in/opinion/why-narendra-modi-...,2,,...,,,,,,,
5,"AMA Announcement: Swati Bhargava, Co-founder o...",5,6qv9k6,india,https://www.reddit.com/r/india/comments/6qv9k6...,121,hello!\n\na little while back some of you had ...,...,,,,,,,
6,Punjab sends 20 trains of wheat and rice to ot...,408,fpcuzw,india,https://www.nationalheraldindia.com/india/punj...,13,,...,,,,,,,
7,[Photography] Durga Pooja celebration aftermat...,250,9p62k4,india,https://imgur.com/KJQjz2J,32,,...,,,,,,,
8,Caste Of A Person Cannot Be Changed By Virtue ...,26,djv5iw,india,https://www.livelaw.in/news-updates/caste-of-a...,6,,...,,,,,,,
9,People who test negative might also have coron...,44,fywlyr,india,https://www.reddit.com/r/india/comments/fywlyr...,3,>scientists have warned about a growing concer...,...,,,,,,,


In [354]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Columns: 4380 entries, title to previously
dtypes: float64(4372), int64(2), object(6)
memory usage: 3.3+ MB


In [437]:
final_df_copy = final_df.copy()

In [438]:
final_df_copy.fillna(0,inplace = True)

In [439]:
final_df_copy.head()

Unnamed: 0,title,score,id,subreddit,url,num_comments,body,...,induce,medication,tire,sanctum,apollo,cradle,undertake
0,"Weekly photography thread thread - January 06,...",3,eko55x,india,https://www.reddit.com/r/india/comments/eko55x...,3,this is a weekly scheduled thread for all the ...,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Advice for an Aussie promoting their business ...,24,em8hu3,india,https://www.reddit.com/r/india/comments/em8hu3...,12,"hi all,\n\n&#x200b;\n\ni hope you don't mind m...",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"abcd here, anyone wanna be friends lol? ama",2,b6unig,india,https://www.reddit.com/r/india/comments/b6unig...,19,actually cbcd (canadian but came to us @ like ...,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Coronavirus (COVID-19) Megathread - News and U...,417,fqqdsg,india,https://www.reddit.com/r/india/comments/fqqdsg...,10799,###[covid-19 fundraisers & donation links](htt...,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Why Narendra Modi has taken the big and the ba...,41,exa23n,india,https://theprint.in/opinion/why-narendra-modi-...,2,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [440]:
reddit_clean = final_df_copy.drop(['title','subreddit','url','body','id','created'],axis = 1)

In [407]:
reddit_clean

Unnamed: 0,score,num_comments,flair,.,n't,stickied,?,...,automatically,cause,apparently,change,anymore,beating,therapy
0,3,3,Scheduled,5.0,1.0,1.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,24,12,Business/Finance,5.0,3.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,19,AMA,3.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,417,10799,AskIndia,31.0,0.0,0.0,15.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,41,2,Politics,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,5,121,AMA,10.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,408,13,Food,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,250,32,Photography,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,26,6,Scheduled,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,44,3,Science/Technology,6.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [147]:
#reddit_final['flair'] = reddit.loc[0:50,'flair']

In [441]:
reddit_clean['flair'].value_counts()

Photography           21
Science/Technology    20
Business/Finance      20
Sports                19
AskIndia              17
Scheduled             16
AMA                   15
Politics              14
Policy/Economy        13
Non-Political         13
Food                  12
Name: flair, dtype: int64

In [442]:
def convert_output_col(output_col):
    output_col_classes = list(output_col.value_counts().index)
    d = {class_:class_no for (class_no,class_) in enumerate(output_col_classes)}
    return d
    
    

In [443]:
d = convert_output_col(reddit_clean['flair'].value_counts().index)
d

{'AMA': 4,
 'AskIndia': 9,
 'Business/Finance': 10,
 'Food': 6,
 'Non-Political': 5,
 'Photography': 0,
 'Policy/Economy': 3,
 'Politics': 7,
 'Scheduled': 8,
 'Science/Technology': 1,
 'Sports': 2}

In [444]:
reddit_final = reddit_clean.copy()

In [445]:
reddit_final['flair_no'] = reddit_clean['flair'].apply(lambda x:d[x])

In [446]:
reddit_clean.drop('flair',axis = 1,inplace = True)

In [447]:
reddit_final.drop('flair',axis= 1, inplace = True)

In [448]:
from sklearn import preprocessing
reddit_norm = reddit_final.copy()
mms = preprocessing.MinMaxScaler()
reddit_norm[reddit_norm.columns.drop('flair_no')] = mms.fit_transform(reddit_norm[reddit_norm.columns.drop('flair_no')])


In [416]:
reddit_norm

Unnamed: 0,score,num_comments,.,n't,stickied,?,#,...,cause,apparently,change,anymore,beating,therapy,flair_no
0,0.002588,0.000278,0.147059,0.083333,1.0,0.133333,0.020408,...,0.0,0.0,0.0,0.0,0.0,0.0,8
1,0.020708,0.001111,0.147059,0.25,0.0,0.133333,0.040816,...,0.0,0.0,0.0,0.0,0.0,0.0,5
2,0.001726,0.001759,0.088235,0.0,0.0,0.066667,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,4
3,0.359793,1.0,0.911765,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3
4,0.035375,0.000185,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,7
5,0.004314,0.011205,0.294118,0.083333,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,4
6,0.352028,0.001204,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,9
7,0.215703,0.002963,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0
8,0.022433,0.000556,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,8
9,0.037964,0.000278,0.176471,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1


In [449]:
reddit_train = split_df(reddit_norm)

In [418]:
reddit_train

Unnamed: 0,score,num_comments,.,n't,stickied,?,#,...,cause,apparently,change,anymore,beating,therapy,flair_no
0,0.002588,0.000278,0.147059,0.083333,1.0,0.133333,0.020408,...,0.0,0.0,0.0,0.0,0.0,0.0,8
1,0.020708,0.001111,0.147059,0.25,0.0,0.133333,0.040816,...,0.0,0.0,0.0,0.0,0.0,0.0,5
2,0.001726,0.001759,0.088235,0.0,0.0,0.066667,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,4
3,0.359793,1.0,0.911765,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3
4,0.035375,0.000185,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,7
5,0.004314,0.011205,0.294118,0.083333,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,4
6,0.352028,0.001204,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,9
7,0.215703,0.002963,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0
8,0.022433,0.000556,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,8
9,0.037964,0.000278,0.176471,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1


In [450]:
reddit_test = split_df(reddit_norm,df_to_return='test')

In [451]:
reddit_new_points = split_df(reddit_norm.drop('flair_no',axis = 1),df_to_return='test')

In [421]:
reddit_new_points

Unnamed: 0,score,num_comments,.,n't,stickied,?,#,...,automatically,cause,apparently,change,anymore,beating,therapy
35,0.014668,0.000185,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0
36,0.322692,0.033429,0.147059,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0
37,0.005177,0.000648,0.264706,0.0,0.0,0.066667,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0
38,0.064711,0.001945,0.176471,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0
39,0.003451,0.000185,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0
40,0.103538,0.003519,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0
41,0.018119,0.001204,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0
42,0.007765,9.3e-05,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0
43,0.03365,0.000648,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0
44,0.008628,0.000278,0.147059,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [319]:
from time import time

In [452]:
st = time()
reddit_pred = knn(reddit_train,reddit_new_points,'flair_no',k = 17)
et = time()
print("KNN algorithm took %s seconds to run" % (et - st))

KNN algorithm took 1004.9792673587799 seconds to run


In [453]:
reddit_predictions = split_df(reddit_pred,df_to_return='test')

In [454]:
reddit_predictions

Unnamed: 0,score,num_comments,.,n't,stickied,?,#,...,medication,tire,sanctum,apollo,cradle,undertake,flair_no
125,0.024159,0.001296,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,5
126,0.040552,0.00037,0.137339,0.0,0.0,0.095238,0.510204,...,0.0,0.0,0.0,0.0,0.0,0.0,6
127,0.059534,0.000556,0.390558,0.272727,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2
128,0.064711,0.006204,0.090129,0.045455,0.0,0.047619,0.020408,...,0.0,0.0,0.0,0.0,0.0,0.0,2
129,0.539258,0.017687,0.038627,0.045455,0.0,0.285714,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2
130,0.044003,0.005278,0.051502,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2
131,0.127696,0.000278,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,7
132,0.112166,0.001667,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,7
133,0.000863,9.3e-05,0.051502,0.0,0.0,0.142857,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0
134,0.015531,0.095379,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,5


In [425]:
reddit_test

In [507]:
reddit_final.describe().loc['max'].max()

10799.0

In [455]:
check_accuracy(pred_df = reddit_predictions,test_df=reddit_test,algo = 'knn',target_class = 'flair_no')

0.12727272727272726

In [505]:
print(time())

1592008176.1816792


In [212]:
d = {1:['fjf','djf']}

In [214]:
list(d.keys())

[1]

In [208]:
string = string.split(' ')

In [209]:
freq = nltk.FreqDist(string)

In [210]:
freq

FreqDist({'/r/askmen,': 1,
          'India': 1,
          'and': 2,
          'answers.': 1,
          'because': 1,
          'culturally': 1,
          'different': 1,
          'from': 1,
          'get': 1,
          'here': 1,
          'importantly': 1,
          'is': 1,
          'more': 2,
          'posted': 1,
          'relevant': 1,
          'socially': 1,
          'will': 1,
          'xposted': 1})

In [122]:
d = {10:[1,4,5,6,8,7],0:[1,2,4],1:[2,5,6,3]}

In [128]:
list(d.values())

[[1, 4, 5, 6, 8, 7], [1, 2, 4], [2, 5, 6, 3]]

In [129]:
d_vals = list(d.values())

In [130]:
d_vals

[[1, 4, 5, 6, 8, 7], [1, 2, 4], [2, 5, 6, 3]]

In [133]:
d_vals.sort(key = len)

In [134]:
d_vals

[[1, 2, 4], [2, 5, 6, 3], [1, 4, 5, 6, 8, 7]]

In [255]:
from nltk.stem import WordNetLemmatizer

In [256]:
lemmatizer = WordNetLemmatizer()

In [257]:
lemmatizer.lemmatize('Worried',pos= 'n')

'Worried'

In [155]:
from nltk.corpus import wordnet as wn

In [158]:

words = ['amazing', 'interesting', 'love', 'great', 'nice']
lemmatized_words = []
d = {}
for w in words:
    tmp = wn.synsets(w)[0].pos()
    d[w] = tmp
for tok in words:
    if d[tok] == 'n':
        lemmatized_words.append(lemmatizer.lemmatize(tok,pos = 'n'))
    if d[tok] == 'v':
        lemmatized_words.append(lemmatizer.lemmatize(tok,pos = 'v'))
    if d[tok] == 'a':
        lemmatized_words.append(lemmatizer.lemmatize(tok,pos = 'a'))
    else d[tok] == 'r':
        lemmatized_words.append(lemmatizer.lemmatize(tok,pos = 'r'))

{'amazing': 'v', 'great': 'n', 'interesting': 'v', 'love': 'n', 'nice': 'n'}

In [159]:
wn.synsets('beautiful')[0].pos()

'a'

In [160]:
wn.synsets('beautifully')[0].pos()

'r'

In [163]:
wn.synsets(',')

[]

In [193]:
wn.synsets('/r/askmne')[0].pos()

IndexError: list index out of range

In [192]:
lemmatizer.lemmatize('posted',pos = 'v')

'post'

In [22]:
df = pd.DataFrame({'col1':[1,2,3,4,2,5,4],'col2':[2,5,7,3,8,4,20]})

In [23]:
df.reset_index(inplace = True)

In [24]:
df

Unnamed: 0,index,col1,col2
0,0,1,2
1,1,2,5
2,2,3,7
3,3,4,3
4,4,2,8
5,5,5,4
6,6,4,20


In [17]:
df.assign(**{'col3':3,'col4':4})

Unnamed: 0,index,col1,col2,col3,col4
0,0,1,2,3,4
1,1,2,5,3,4
2,2,3,7,3,4


In [26]:
df.col3.iloc[[0,2]] = 12
#df[['col3','col4']].iloc[0] = df.assign(**{'col3':3,'col4':4})

AttributeError: 'DataFrame' object has no attribute 'col3'

In [25]:
df

Unnamed: 0,index,col1,col2
0,0,12,2
1,1,2,5
2,2,12,7
