In [3]:

train = pd.read_csv("reddit_train.csv")
train.head()
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

sns.set()

## 1 - Open training data

In [4]:
#train_data = "reddit_train.csv"
#train = pd.read_csv(train_data)
train.head()

Unnamed: 0,id,comments,subreddits
0,0,"Honestly, Buffalo is the correct answer. I rem...",hockey
1,1,Ah yes way could have been :( remember when he...,nba
2,2,https://youtu.be/6xxbBR8iSZ0?t=40m49s\n\nIf yo...,leagueoflegends
3,3,He wouldn't have been a bad signing if we woul...,soccer
4,4,Easy. You use the piss and dry technique. Let ...,funny


## 2 - Basic Feature Extraction
 

### Number of Words

In [5]:
train['word_count'] = train['comments'].apply(lambda x: len(str(x).split(" ")))
train.head()

Unnamed: 0,id,comments,subreddits,word_count
0,0,"Honestly, Buffalo is the correct answer. I rem...",hockey,58
1,1,Ah yes way could have been :( remember when he...,nba,29
2,2,https://youtu.be/6xxbBR8iSZ0?t=40m49s\n\nIf yo...,leagueoflegends,18
3,3,He wouldn't have been a bad signing if we woul...,soccer,24
4,4,Easy. You use the piss and dry technique. Let ...,funny,46


### 2.1 Number of Characters

In [30]:
train['char_count'] = train['comments'].str.len() ## this also includes spaces
train.head()
train['id']

0            0
1            1
2            2
3            3
4            4
5            5
6            6
7            7
8            8
9            9
10          10
11          11
12          12
13          13
14          14
15          15
16          16
17          17
18          18
19          19
20          20
21          21
22          22
23          23
24          24
25          25
26          26
27          27
28          28
29          29
         ...  
69970    69970
69971    69971
69972    69972
69973    69973
69974    69974
69975    69975
69976    69976
69977    69977
69978    69978
69979    69979
69980    69980
69981    69981
69982    69982
69983    69983
69984    69984
69985    69985
69986    69986
69987    69987
69988    69988
69989    69989
69990    69990
69991    69991
69992    69992
69993    69993
69994    69994
69995    69995
69996    69996
69997    69997
69998    69998
69999    69999
Name: id, Length: 70000, dtype: int64

### 2.2 Removing the stopwords

In [7]:
# do this if its the first time using nltk stopwords
import nltk
nltk.download('stopwords') 

[nltk_data] Downloading package stopwords to /home/vasu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
from nltk.corpus import stopwords

stop = stopwords.words('english')

train['stopwords'] = train['comments'].apply(lambda x: len([x for x in x.split() if x in stop]))
train.head()

Unnamed: 0,id,comments,subreddits,word_count,char_count,stopwords
0,0,"Honestly, Buffalo is the correct answer. I rem...",hockey,58,357,20
1,1,Ah yes way could have been :( remember when he...,nba,29,145,12
2,2,https://youtu.be/6xxbBR8iSZ0?t=40m49s\n\nIf yo...,leagueoflegends,18,145,9
3,3,He wouldn't have been a bad signing if we woul...,soccer,24,123,12
4,4,Easy. You use the piss and dry technique. Let ...,funny,46,212,17


### 2.3 Does the comment contain a url ?

In [9]:
train['links'] = train['comments'].apply(lambda x: True if "http" in x else False)
train.head()

Unnamed: 0,id,comments,subreddits,word_count,char_count,stopwords,links
0,0,"Honestly, Buffalo is the correct answer. I rem...",hockey,58,357,20,False
1,1,Ah yes way could have been :( remember when he...,nba,29,145,12,False
2,2,https://youtu.be/6xxbBR8iSZ0?t=40m49s\n\nIf yo...,leagueoflegends,18,145,9,True
3,3,He wouldn't have been a bad signing if we woul...,soccer,24,123,12,False
4,4,Easy. You use the piss and dry technique. Let ...,funny,46,212,17,False


### 2.4 Number of Numerics

In [10]:
train['numerics'] = train['comments'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
train.head()

Unnamed: 0,id,comments,subreddits,word_count,char_count,stopwords,links,numerics
0,0,"Honestly, Buffalo is the correct answer. I rem...",hockey,58,357,20,False,1
1,1,Ah yes way could have been :( remember when he...,nba,29,145,12,False,0
2,2,https://youtu.be/6xxbBR8iSZ0?t=40m49s\n\nIf yo...,leagueoflegends,18,145,9,True,0
3,3,He wouldn't have been a bad signing if we woul...,soccer,24,123,12,False,0
4,4,Easy. You use the piss and dry technique. Let ...,funny,46,212,17,False,0


### 2.5 Number of Uppercase words

In [11]:
train['upper'] = train['comments'].apply(lambda x: len([x for x in x.split() if x.isupper()]))
train.head()

Unnamed: 0,id,comments,subreddits,word_count,char_count,stopwords,links,numerics,upper
0,0,"Honestly, Buffalo is the correct answer. I rem...",hockey,58,357,20,False,1,2
1,1,Ah yes way could have been :( remember when he...,nba,29,145,12,False,0,1
2,2,https://youtu.be/6xxbBR8iSZ0?t=40m49s\n\nIf yo...,leagueoflegends,18,145,9,True,0,0
3,3,He wouldn't have been a bad signing if we woul...,soccer,24,123,12,False,0,1
4,4,Easy. You use the piss and dry technique. Let ...,funny,46,212,17,False,0,0


## 3 - Basic Pre-processing

In [12]:
# Keep a copy of the unprocessed training dataframe just in case
unprocessed_train = train.copy()

### 3.1 - Lowercase

In [13]:
train['comments'] = train['comments'].apply(lambda x: " ".join(x.lower() for x in x.split()))
train.head()

Unnamed: 0,id,comments,subreddits,word_count,char_count,stopwords,links,numerics,upper
0,0,"honestly, buffalo is the correct answer. i rem...",hockey,58,357,20,False,1,2
1,1,ah yes way could have been :( remember when he...,nba,29,145,12,False,0,1
2,2,https://youtu.be/6xxbbr8isz0?t=40m49s if you d...,leagueoflegends,18,145,9,True,0,0
3,3,he wouldn't have been a bad signing if we woul...,soccer,24,123,12,False,0,1
4,4,easy. you use the piss and dry technique. let ...,funny,46,212,17,False,0,0


### 3.2 Removing Punctuation

In [14]:
train['comments'] = train['comments'].str.replace('[^\w\s]','')
train.head()

Unnamed: 0,id,comments,subreddits,word_count,char_count,stopwords,links,numerics,upper
0,0,honestly buffalo is the correct answer i remem...,hockey,58,357,20,False,1,2
1,1,ah yes way could have been remember when he w...,nba,29,145,12,False,0,1
2,2,httpsyoutube6xxbbr8isz0t40m49s if you didnt fi...,leagueoflegends,18,145,9,True,0,0
3,3,he wouldnt have been a bad signing if we would...,soccer,24,123,12,False,0,1
4,4,easy you use the piss and dry technique let a ...,funny,46,212,17,False,0,0


### 3.2 Removal of Stop Words

In [15]:
train['comments'] = train['comments'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
train.head()

Unnamed: 0,id,comments,subreddits,word_count,char_count,stopwords,links,numerics,upper
0,0,honestly buffalo correct answer remember peopl...,hockey,58,357,20,False,1,2
1,1,ah yes way could remember drafted thought gonn...,nba,29,145,12,False,0,1
2,2,httpsyoutube6xxbbr8isz0t40m49s didnt find alre...,leagueoflegends,18,145,9,True,0,0
3,3,wouldnt bad signing wouldnt paid 18m euros rig...,soccer,24,123,12,False,0,1
4,4,easy use piss dry technique let drops let dry ...,funny,46,212,17,False,0,0


### 3.3 Spelling correction - WARNING: takes a very long time!

In [16]:
!pip install -U textblob
############not running this code because it takes so much time

Collecting textblob
[?25l  Downloading https://files.pythonhosted.org/packages/60/f0/1d9bfcc8ee6b83472ec571406bd0dd51c0e6330ff1a51b2d29861d389e85/textblob-0.15.3-py2.py3-none-any.whl (636kB)
[K     |████████████████████████████████| 645kB 1.8MB/s eta 0:00:01
Installing collected packages: textblob
Successfully installed textblob-0.15.3


from textblob import TextBlob
train['comments'].apply(lambda x: str(TextBlob(x).correct()))
train.head()

## 3.4 tokenizing and lemmatization 

In [17]:
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('punkt')


[nltk_data] Downloading package punkt to /home/vasu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [18]:
train['comments'] = train['comments'].apply(lambda x: word_tokenize(x))


In [19]:
ps = PorterStemmer()

train['comments'] = train['comments'].apply(lambda x:" ".join(ps.stem(word)for word in x))


In [20]:
train.head()

Unnamed: 0,id,comments,subreddits,word_count,char_count,stopwords,links,numerics,upper
0,0,honestli buffalo correct answer rememb peopl s...,hockey,58,357,20,False,1,2
1,1,ah ye way could rememb draft thought gon na gr...,nba,29,145,12,False,0,1
2,2,httpsyoutube6xxbbr8isz0t40m49 didnt find alrea...,leagueoflegends,18,145,9,True,0,0
3,3,wouldnt bad sign wouldnt paid 18m euro right p...,soccer,24,123,12,False,0,1
4,4,easi use piss dri techniqu let drop let dri ri...,funny,46,212,17,False,0,0


In [21]:
targets=np.array(train["subreddits"])
traindata=np.array(train["comments"])
len(targets)
len(traindata)

70000

In [23]:
#######test data


reddit_test = pd.read_csv("reddit_test.csv")
reddit_test.head()
X_test = reddit_test["comments"]

## Building a pipeline
#### vectorizer => transformer => classifier
##### multinomial naive bayes


In [24]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np


from sklearn.pipeline import Pipeline
text_clf = Pipeline([
     ('vect', CountVectorizer()),
     ('tfidf', TfidfTransformer()),
     ('clf', MultinomialNB()),
 ])

text_clf.fit(traindata, targets) 

x=text_clf.predict(X_test)

In [32]:
x=np.array(x)
len(x)
submission = pd.DataFrame({'Id':range(30000),'Category':x})
submission.head()
filename = 'prediction.csv'

submission.to_csv(filename,index=False)

print('Saved file: ' + filename)
submission.head()



Saved file: prediction.csv


Unnamed: 0,Id,Category
0,0,baseball
1,1,wow
2,2,soccer
3,3,worldnews
4,4,wow


### Building a pipeline
##### Support vector machine

In [35]:
from sklearn.linear_model import SGDClassifier
text_clf = Pipeline([
 ('vect', CountVectorizer()),
('tfidf', TfidfTransformer()),
 ('clf', SGDClassifier(loss='hinge', penalty='l2',
                           alpha=1e-3, random_state=42,
                           max_iter=5, tol=None)),
])
text_clf.fit(traindata, targets) 
x=text_clf.predict(X_test)

In [39]:
x=np.array(x)
len(x)
submission = pd.DataFrame({'Id':range(30000),'Category':x})
submission.head()
filename = 'prediction.csv'

submission.to_csv(filename,index=False)

print('Saved file: ' + filename)
submission.head(10)


Saved file: prediction.csv


Unnamed: 0,Id,Category
0,0,baseball
1,1,wow
2,2,baseball
3,3,wow
4,4,wow
5,5,Overwatch
6,6,baseball
7,7,anime
8,8,soccer
9,9,wow
