In [26]:
import numpy as np
import pandas as pd
from nltk import pos_tag
from nltk.tokenize import word_tokenize
import re
from nltk.corpus import stopwords

## Importing the Data

In [2]:
# importing dataset
df = pd.read_csv('Scrapper/comments.csv', header=None)
df.rename(columns={0: 'comments'}, inplace=True)


## Describing the Data

In [3]:
df.head()

Unnamed: 0,comments
0,Love you sir!!\n
1,Please make videos on..\nMidpoint circle drawi...
2,I bought both of your courses on Udemy. You ar...
3,"Thank you very much, u really got me in the fi..."
4,i hope u are ok with everything going on again...


In [4]:
# dimension of the dataset
df.shape

(2018, 1)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2018 entries, 0 to 2017
Data columns (total 1 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   comments  2018 non-null   object
dtypes: object(1)
memory usage: 15.9+ KB


In [6]:
df.describe()

Unnamed: 0,comments
count,2018
unique,1943
top,Thank you sir\n
freq,10


## Preprocessing

In [7]:
# checking for NaN values
df.isnull().sum()

comments    0
dtype: int64

There is no **NaN** value in the dataset.

In [8]:
# checking for duplicate 
df.duplicated().any()

True

Duplicate row is present. Let's remove the duplicate rows.

In [9]:
# removing duplicate rows
df = df.drop_duplicates()
df.duplicated().any()

False

Duplicated removed succefully.

In [10]:
# lowercasing
df['comments'] = df['comments'].str.lower()
df['comments']

0                                        love you sir!!\n
1       please make videos on..\nmidpoint circle drawi...
2       i bought both of your courses on udemy. you ar...
3       thank you very much, u really got me in the fi...
4       i hope u are ok with everything going on again...
                              ...                        
2013                       excellent  , really helpfull\n
2014    rabin-karp is also useful for matching multipl...
2015                                   koma lecture sir\n
2016    i think sir u r working in algorithm its great...
2017             thank  u sir for wonderful explanation\n
Name: comments, Length: 1943, dtype: object

In [11]:
# removing URLs
df['comments'] = df['comments'].str.replace('http\S+|www.\S+', '', case=False)
df

Unnamed: 0,comments
0,love you sir!!\n
1,please make videos on..\nmidpoint circle drawi...
2,i bought both of your courses on udemy. you ar...
3,"thank you very much, u really got me in the fi..."
4,i hope u are ok with everything going on again...
...,...
2013,"excellent , really helpfull\n"
2014,rabin-karp is also useful for matching multipl...
2015,koma lecture sir\n
2016,i think sir u r working in algorithm its great...


In [16]:
# removing new lines "\n"
df['comments'] = df['comments'].replace('\n','', regex=True)
df

Unnamed: 0,comments
0,love you sir!!
1,please make videos on..midpoint circle drawing...
2,i bought both of your courses on udemy. you ar...
3,"thank you very much, u really got me in the fi..."
4,i hope u are ok with everything going on again...
...,...
2013,"excellent , really helpfull"
2014,rabin-karp is also useful for matching multipl...
2015,koma lecture sir
2016,i think sir u r working in algorithm its great...


In [19]:
# tokenizing


0                  [(love, NN), (you, PRP), (sir!!, VBP)]
1       [(please, VB), (make, VB), (videos, JJ), (on.....
2       [(i, NN), (bought, VBD), (both, DT), (of, IN),...
3       [(thank, NN), (you, PRP), (very, RB), (much,, ...
4       [(i, NNS), (hope, VBP), (u, NNS), (are, VBP), ...
                              ...                        
2013    [(excellent, NN), (,, ,), (really, RB), (helpf...
2014    [(rabin-karp, NN), (is, VBZ), (also, RB), (use...
2015               [(koma, NN), (lecture, NN), (sir, NN)]
2016    [(i, NN), (think, VBP), (sir, NN), (u, JJ), (r...
2017    [(thank, NN), (u, JJ), (sir, NN), (for, IN), (...
Name: comments, Length: 1943, dtype: object

In [None]:
# POS tagging
tagged_df = df['comments'].str.split().map(pos_tag)
tagged_df

In [30]:
# # removing stopwords
# stop = stopwords.words('english')
# df['comments'].apply(lambda x: [item for item in x if item not in stop])

In [None]:
# df.to_csv('file_name.csv', sep='\t')