## Parameshwari S - CB.SC.I5DAS18026

## Importing required packages

In [33]:
import pandas as pd
import numpy as np
import string
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('punkt')

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Data Exploration

In [34]:
df = pd.read_excel("/content/dataset_1.xlsx")
df.head()

Unnamed: 0,inline-comment-id,# Comment,Question,Question_H,Final Label
0,84326dd1_566c7146,1,is this what they intended? don't they really ...,is this what they intended?,request for confirmation
1,84326dd1_566c7146,2,is this what they intended? don't they really ...,don't they really want $(TARGET_OUT_DATA_NATIV...,surprise
2,99d1f8e4_92b31cea,3,Don't we need to increment 'i' in the else cas...,Don't we need to increment 'i' in the else cas...,suggestion
3,193d089f_f5fac752,4,i can't see anywhere where this is set to fals...,should we just adjust the single reference in ...,suggestion
4,50c2f81e_ac4fd6fc,5,are you sure you want to include this source f...,are you sure you want to include this source f...,criticism


In [35]:
df.shape

(499, 5)

In [36]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 499 entries, 0 to 498
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   inline-comment-id  499 non-null    object
 1   # Comment          499 non-null    int64 
 2   Question           499 non-null    object
 3   Question_H         499 non-null    object
 4   Final Label        499 non-null    object
dtypes: int64(1), object(4)
memory usage: 19.6+ KB


In [37]:
df['Question_H'].duplicated()

0      False
1      False
2      False
3      False
4      False
       ...  
494    False
495    False
496    False
497    False
498    False
Name: Question_H, Length: 499, dtype: bool

In [38]:
df['Final Label'].value_counts()

suggestion                   163
request for confirmation      92
request for information       58
request for rationale         45
request for action            32
criticism                     25
request for clarification     24
request for opinion           21
hypothetical scenario         13
surprise                       7
anger                          6
rhetorical question            6
discarded                      5
action                         2
Name: Final Label, dtype: int64

In [39]:
df = df.replace(to_replace = ["request for confirmation", "request for information","request for rationale",
                                 "request for action","request for clarification","request for opinion","action"],
                    value = "Requests")
df = df.replace(to_replace = ["criticism","anger","surprise"], value = 'Attitudes and emotions')

In [40]:
df['Final Label'].value_counts()

Requests                  274
suggestion                163
Attitudes and emotions     38
hypothetical scenario      13
rhetorical question         6
discarded                   5
Name: Final Label, dtype: int64

In [41]:
df.rename(columns = {'inline-comment-id' : 'Id', '# Comment' : 'Comment'}, inplace = True)

In [42]:
df['Question_H'][4]

'are you sure you want to include this source file directly ?'

In [43]:
df['Question_H'][5]

'Why not create a static library ?'

In [44]:
df['Question_H'][4] == df['Question_H'][5]

False

## Text-Preprocessing

In [45]:
dfc = df.copy()

In [46]:
def remove_special_characters(text):
    pat = r'[^a-zA-z0-9.,!?/:;\"\'\s]'
    return re.sub(pat, '', text)

In [47]:
def removePunctuations(sentence):
    cleaned_text  = re.sub('[^a-zA-Z]',' ',sentence)
    return cleaned_text

In [48]:
def line_breaks(x):
  return x.replace('\n', ' ').replace('\r', '')

In [49]:
def space_num(x):
  x = re.sub("\S*\d\S*", "", x).strip()
  return x

In [50]:
def removeURL(sentence):
    text = re.sub(r"http\S+", " ", sentence)
    sentence = re.sub(r"www.\S+", " ", text)
    return sentence

In [51]:
def remove_html(text):
    html_pattern = re.compile('<.*?>')
    return html_pattern.sub(r'', text)

In [52]:
def removePatterns(sentence): 
    cleaned_text  = re.sub("\\s*\\b(?=\\w*(\\w)\\1{2,})\\w*\\b",' ',sentence)
    return (cleaned_text)

In [53]:
def decontracted(phrase):
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

In [54]:
lemmatizer = WordNetLemmatizer()
def pos_tagger(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:          
        return None
def lemmatize_sent(i):
  pos_tagged = nltk.pos_tag(nltk.word_tokenize(i))  
  wordnet_tagged = list(map(lambda x: (x[0], pos_tagger(x[1])), pos_tagged))
  lemmatized_sentence = []
  for word, tag in wordnet_tagged:
    if tag is None:
        lemmatized_sentence.append(word)
    else:        
        lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))
  lemmatized_sentence = " ".join(lemmatized_sentence)
  return lemmatized_sentence

In [55]:
final = []
for text in dfc['Question_H'].values:
  filter = []
  text = remove_special_characters(text)
  text = text.lower()
  text = removePunctuations(text)
  text = line_breaks(text)
  text = removeURL(text)
  text = remove_html(text)
  text = decontracted(text)
  text = space_num(text)
  text = removePatterns(text)
  for cleaned_words in text.split():  
    s = lemmatize_sent(cleaned_words)
    filter.append(s)
  text = " ".join(filter)
  final.append(text)

In [56]:
dfc['Final text'] = final
dfc.head()

Unnamed: 0,Id,Comment,Question,Question_H,Final Label,Final text
0,84326dd1_566c7146,1,is this what they intended? don't they really ...,is this what they intended?,Requests,be this what they intend
1,84326dd1_566c7146,2,is this what they intended? don't they really ...,don't they really want $(TARGET_OUT_DATA_NATIV...,Attitudes and emotions,don t they really want target out data native ...
2,99d1f8e4_92b31cea,3,Don't we need to increment 'i' in the else cas...,Don't we need to increment 'i' in the else cas...,suggestion,don t we need to increment i in the else case ...
3,193d089f_f5fac752,4,i can't see anywhere where this is set to fals...,should we just adjust the single reference in ...,suggestion,should we just adjust the single reference in ...
4,50c2f81e_ac4fd6fc,5,are you sure you want to include this source f...,are you sure you want to include this source f...,Attitudes and emotions,be you sure you want to include this source fi...


##Without adding Feature

##Vectorizing

In [57]:
Tfidf = TfidfVectorizer(ngram_range = (1, 2))
tfd = Tfidf.fit_transform(dfc['Final text'])

##Train-test split

In [58]:
Xtrain, Xtest, ytrain, ytest = train_test_split(tfd, dfc['Final Label'], random_state = 42, test_size = 0.1)

##Evaluation

In [59]:
lr = LogisticRegression(max_iter = 500)
pred = lr.fit(Xtrain,ytrain).predict(Xtest)
print("Logistic Regression - ", accuracy_score(ytest, pred))

svc = svm.SVC()
pred = svc.fit(Xtrain, ytrain).predict(Xtest)
print("Support Vector machine - ", accuracy_score(ytest, pred))

mn = MultinomialNB()
pred = mn.fit(Xtrain, ytrain).predict(Xtest)
print("Naive Bayes - ", accuracy_score(ytest, pred))

Logistic Regression -  0.8
Support Vector machine -  0.74
Naive Bayes -  0.74


##Adding extra Feature - Length of the text

In [60]:
length = []
for i in range(len(dfc)):
  length.append(len(dfc['Question_H'][i].split()))
dfc['Length'] = length
dfc.head()

Unnamed: 0,Id,Comment,Question,Question_H,Final Label,Final text,Length
0,84326dd1_566c7146,1,is this what they intended? don't they really ...,is this what they intended?,Requests,be this what they intend,5
1,84326dd1_566c7146,2,is this what they intended? don't they really ...,don't they really want $(TARGET_OUT_DATA_NATIV...,Attitudes and emotions,don t they really want target out data native ...,5
2,99d1f8e4_92b31cea,3,Don't we need to increment 'i' in the else cas...,Don't we need to increment 'i' in the else cas...,suggestion,don t we need to increment i in the else case ...,16
3,193d089f_f5fac752,4,i can't see anywhere where this is set to fals...,should we just adjust the single reference in ...,suggestion,should we just adjust the single reference in ...,13
4,50c2f81e_ac4fd6fc,5,are you sure you want to include this source f...,are you sure you want to include this source f...,Attitudes and emotions,be you sure you want to include this source fi...,12


##Vectorizing

In [61]:
Tfidf = TfidfVectorizer(ngram_range = (1, 2))
tfd = Tfidf.fit_transform(dfc['Final text'])

In [62]:
x = list(tfd.toarray())
y = dfc['Length'].to_list()
z = []
for i,j in enumerate(x):
  z.append(np.append(x[i], y[i]))
X = np.asarray(z)

##Train-test split

In [63]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X, dfc['Final Label'], random_state = 42, test_size = 0.1)

##Evaluation

In [64]:
lr = LogisticRegression(max_iter = 500)
pred = lr.fit(Xtrain,ytrain).predict(Xtest)
print("Logistic Regression - ", accuracy_score(ytest, pred))

svc = svm.SVC()
pred = svc.fit(Xtrain, ytrain).predict(Xtest)
print("Support Vector machine - ", accuracy_score(ytest, pred))

mn = MultinomialNB()
pred = mn.fit(Xtrain, ytrain).predict(Xtest)
print("Naive Bayes - ", accuracy_score(ytest, pred))

Logistic Regression -  0.78
Support Vector machine -  0.72
Naive Bayes -  0.74
