In [14]:
# importing the libraries

import pandas as pd
import re
from nltk.corpus import stopwords

In [2]:
# loading the data

df = pd.read_csv('Scrapper/final.csv', header=None)
df.rename(columns={0: 'comment', 1: 'category'}, inplace=True)

In [5]:
df.category.unique()

array(['positive', 'imperative', 'interrogative', 'other', 'correction',
       'negative'], dtype=object)

In [6]:
df['category_id'] = df['category'].factorize()[0]
df.head()

Unnamed: 0,comment,category,category_id
0,Love you sir!!,positive,0
1,Please make videos on..Midpoint circle drawing...,imperative,1
2,I bought both of your courses on Udemy. You ar...,interrogative,2
3,"Thank you very much, u really got me in the fi...",positive,0
4,i hope u are ok with everything going on again...,other,3


In [7]:
df.isnull().sum()

comment        0
category       0
category_id    0
dtype: int64

In [8]:
df.shape

(1201, 3)

In [11]:
# preprocessing

# lowercasing
df['comment'] = df['comment'].str.lower()

# removing urls
df['comment'] = df['comment'].str.replace('http\S+|www.\S+', '', case=False)

# removing new lines "\n"
df['comment'] = df['comment'].replace('\n','', regex=True)

# removing all the punctuations
df["comment"] = df['comment'].str.replace('[^\w\s]',' ')

# removing integers
df['comment'] = df['comment'].replace('\d','', regex=True)

# removing emojis
df['comment'] = df['comment'].str.replace('[^\w\s#@/:%.,_-]', '', flags=re.UNICODE)
df

Unnamed: 0,comment,category,category_id
0,love you sir,positive,0
1,please make videos on midpoint circle drawing...,imperative,1
2,i bought both of your courses on udemy you ar...,interrogative,2
3,thank you very much u really got me in the fi...,positive,0
4,i hope u are ok with everything going on again...,other,3
...,...,...,...
1196,what is the purpose of writing main function,interrogative,2
1197,your teaching skill is awesome,positive,0
1198,we can use online compiler to run a program t...,interrogative,2
1199,c programs can be compiled in browser and that...,interrogative,2


In [12]:
# stemming
import nltk

stemmer = nltk.stem.SnowballStemmer('english')
def stem_text(text):
    return stemmer.stem(text)

df['comment'] = df['comment'].apply(stem_text)


# lemmatizing
lemmatizer = nltk.stem.WordNetLemmatizer()
def lemmatize_text(text):
    return lemmatizer.lemmatize(text)

df['comment'] = df['comment'].apply(lemmatize_text)
df

Unnamed: 0,comment,category,category_id
0,love you sir,positive,0
1,please make videos on midpoint circle drawing...,imperative,1
2,i bought both of your courses on udemy you ar...,interrogative,2
3,thank you very much u really got me in the fi...,positive,0
4,i hope u are ok with everything going on again...,other,3
...,...,...,...
1196,what is the purpose of writing main funct,interrogative,2
1197,your teaching skill is awesome,positive,0
1198,we can use online compiler to run a program t...,interrogative,2
1199,c programs can be compiled in browser and that...,interrogative,2


In [15]:
# removing stopwords
stop = stopwords.words('english')

df['comment'] = df['comment'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
df

Unnamed: 0,comment,category,category_id
0,love sir,positive,0
1,please make videos midpoint circle drawing bre...,imperative,1
2,bought courses udemy talented teacher keep pro...,interrogative,2
3,thank much u really got first video algorithm ...,positive,0
4,hope u ok everything going muslims indea,other,3
...,...,...,...
1196,purpose writing main funct,interrogative,2
1197,teaching skill awesome,positive,0
1198,use online compiler run program became standal...,interrogative,2
1199,c programs compiled browser need installation ...,interrogative,2


In [39]:
# feature selection

def features(text):

    wordlist = wordstring.split()
    wordfreq = []
    
    for w in wordlist:
        c = wordlist.count(w)/len(wordlist)
        wordfreq.append(c)

    return wordfreq

wordstring = 'please make videos midpoint circle drawing please'
features(wordstring)

[0.2857142857142857,
 0.14285714285714285,
 0.14285714285714285,
 0.14285714285714285,
 0.14285714285714285,
 0.14285714285714285,
 0.2857142857142857]