## Parameshwari S - CB.SC.I5DAS18026

## Importing required packages

In [1]:
import pandas as pd
import numpy as np
import string
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('punkt')

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## Data Exploration

In [2]:
df = pd.read_excel("/content/dataset.xlsx")
df.head()

Unnamed: 0,inline-comment-id,# Comment,Question,Final Label
0,84326dd1_566c7146,1,is this what they intended? don't they really ...,request for confirmation
1,84326dd1_566c7146,2,is this what they intended? don't they really ...,surprise
2,99d1f8e4_92b31cea,3,Don't we need to increment 'i' in the else cas...,suggestion
3,193d089f_f5fac752,4,i can't see anywhere where this is set to fals...,suggestion
4,50c2f81e_ac4fd6fc,5,are you sure you want to include this source f...,criticism


In [3]:
df.shape

(499, 4)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 499 entries, 0 to 498
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   inline-comment-id  499 non-null    object
 1   # Comment          499 non-null    int64 
 2   Question           499 non-null    object
 3   Final Label        499 non-null    object
dtypes: int64(1), object(3)
memory usage: 15.7+ KB


In [5]:
df['Question'].duplicated()

0      False
1       True
2      False
3      False
4      False
       ...  
494     True
495    False
496     True
497    False
498    False
Name: Question, Length: 499, dtype: bool

In [6]:
df['Final Label'].value_counts()

suggestion                   163
request for confirmation      92
request for information       58
request for rationale         45
request for action            32
criticism                     25
request for clarification     24
request for opinion           21
hypothetical scenario         13
surprise                       7
rhetorical question            6
anger                          6
discarded                      5
action                         2
Name: Final Label, dtype: int64

In [7]:
df = df.replace(to_replace = ["request for confirmation", "request for information","request for rationale",
                                 "request for action","request for clarification","request for opinion","action"],
                    value = "Requests")
df = df.replace(to_replace = ["criticism","anger","surprise"], value = 'Attitudes and emotions')

In [8]:
df['Final Label'].value_counts()

Requests                  274
suggestion                163
Attitudes and emotions     38
hypothetical scenario      13
rhetorical question         6
discarded                   5
Name: Final Label, dtype: int64

In [9]:
df.rename(columns = {'inline-comment-id' : 'Id', '# Comment' : 'Comment'}, inplace = True)

In [10]:
df['Question'][4]

'are you sure you want to include this source file directly ? Why not create a static library ?\n'

In [11]:
df['Question'][5]

'are you sure you want to include this source file directly ? Why not create a static library ?\n'

In [12]:
df['Question'][4] == df['Question'][5]

True

## Text-Preprocessing

In [13]:
dfc = df.copy()

In [14]:
def remove_special_characters(text):
    pat = r'[^a-zA-z0-9.,!?/:;\"\'\s]'
    return re.sub(pat, '', text)

In [15]:
def removePunctuations(sentence):
    cleaned_text  = re.sub('[^a-zA-Z]',' ',sentence)
    return cleaned_text

In [16]:
def line_breaks(x):
  return x.replace('\n', ' ').replace('\r', '')

In [17]:
def space_num(x):
  x = re.sub("\S*\d\S*", "", x).strip()
  return x

In [18]:
def removeURL(sentence):
    text = re.sub(r"http\S+", " ", sentence)
    sentence = re.sub(r"www.\S+", " ", text)
    return sentence

In [19]:
def remove_html(text):
    html_pattern = re.compile('<.*?>')
    return html_pattern.sub(r'', text)

In [20]:
def removePatterns(sentence): 
    cleaned_text  = re.sub("\\s*\\b(?=\\w*(\\w)\\1{2,})\\w*\\b",' ',sentence)
    return (cleaned_text)

In [21]:
def decontracted(phrase):
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

In [22]:
lemmatizer = WordNetLemmatizer()
def pos_tagger(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:          
        return None
def lemmatize_sent(i):
  pos_tagged = nltk.pos_tag(nltk.word_tokenize(i))  
  wordnet_tagged = list(map(lambda x: (x[0], pos_tagger(x[1])), pos_tagged))
  lemmatized_sentence = []
  for word, tag in wordnet_tagged:
    if tag is None:
        lemmatized_sentence.append(word)
    else:        
        lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))
  lemmatized_sentence = " ".join(lemmatized_sentence)
  return lemmatized_sentence

In [23]:
final = []
for text in dfc['Question'].values:
  filter = []
  text = remove_special_characters(text)
  text = text.lower()
  text = removePunctuations(text)
  text = line_breaks(text)
  text = removeURL(text)
  text = remove_html(text)
  text = decontracted(text)
  text = space_num(text)
  text = removePatterns(text)
  for cleaned_words in text.split():  
    s = lemmatize_sent(cleaned_words)
    filter.append(s)
  text = " ".join(filter)
  final.append(text)

In [24]:
dfc['Final text'] = final
dfc.head()

Unnamed: 0,Id,Comment,Question,Final Label,Final text
0,84326dd1_566c7146,1,is this what they intended? don't they really ...,Requests,be this what they intend don t they really wan...
1,84326dd1_566c7146,2,is this what they intended? don't they really ...,Attitudes and emotions,be this what they intend don t they really wan...
2,99d1f8e4_92b31cea,3,Don't we need to increment 'i' in the else cas...,suggestion,don t we need to increment i in the else case ...
3,193d089f_f5fac752,4,i can't see anywhere where this is set to fals...,suggestion,i can t see anywhere where this be set to fals...
4,50c2f81e_ac4fd6fc,5,are you sure you want to include this source f...,Attitudes and emotions,be you sure you want to include this source fi...


##Without adding Feature

##Vectorizing

In [25]:
Tfidf = TfidfVectorizer(ngram_range = (1, 2))
tfd = Tfidf.fit_transform(dfc['Final text'])

##Train-test split

In [26]:
Xtrain, Xtest, ytrain, ytest = train_test_split(tfd, dfc['Final Label'], random_state = 42, test_size = 0.1)

##Evaluation

In [27]:
lr = LogisticRegression(max_iter = 500)
pred = lr.fit(Xtrain,ytrain).predict(Xtest)
print("Logistic Regression - ", accuracy_score(ytest, pred))

svc = svm.SVC()
pred = svc.fit(Xtrain, ytrain).predict(Xtest)
print("Support Vector machine - ", accuracy_score(ytest, pred))

mn = MultinomialNB()
pred = mn.fit(Xtrain, ytrain).predict(Xtest)
print("Naive Bayes - ", accuracy_score(ytest, pred))

Logistic Regression -  0.74
Support Vector machine -  0.7
Naive Bayes -  0.72


##Adding extra Feature - Length of the text

In [28]:
length = []
for i in range(len(dfc)):
  length.append(len(dfc['Question'][i].split()))
dfc['Length'] = length
dfc.head()

Unnamed: 0,Id,Comment,Question,Final Label,Final text,Length
0,84326dd1_566c7146,1,is this what they intended? don't they really ...,Requests,be this what they intend don t they really wan...,10
1,84326dd1_566c7146,2,is this what they intended? don't they really ...,Attitudes and emotions,be this what they intend don t they really wan...,10
2,99d1f8e4_92b31cea,3,Don't we need to increment 'i' in the else cas...,suggestion,don t we need to increment i in the else case ...,16
3,193d089f_f5fac752,4,i can't see anywhere where this is set to fals...,suggestion,i can t see anywhere where this be set to fals...,23
4,50c2f81e_ac4fd6fc,5,are you sure you want to include this source f...,Attitudes and emotions,be you sure you want to include this source fi...,19


##Vectorizing

In [29]:
Tfidf = TfidfVectorizer(ngram_range = (1, 2))
tfd = Tfidf.fit_transform(dfc['Final text'])

In [30]:
x = list(tfd.toarray())
y = dfc['Length'].to_list()
z = []
for i,j in enumerate(x):
  z.append(np.append(x[i], y[i]))
X = np.asarray(z)

##Train-test split

In [31]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X, dfc['Final Label'], random_state = 42, test_size = 0.1)

##Evaluation

In [32]:
lr = LogisticRegression(max_iter = 500)
pred = lr.fit(Xtrain,ytrain).predict(Xtest)
print("Logistic Regression - ", accuracy_score(ytest, pred))

svc = svm.SVC()
pred = svc.fit(Xtrain, ytrain).predict(Xtest)
print("Support Vector machine - ", accuracy_score(ytest, pred))

mn = MultinomialNB()
pred = mn.fit(Xtrain, ytrain).predict(Xtest)
print("Naive Bayes - ", accuracy_score(ytest, pred))

Logistic Regression -  0.68
Support Vector machine -  0.56
Naive Bayes -  0.74
