In [None]:
import csv
import pandas as pd

def extract_data(path):
  """Read a tab-separated text file into a pandas DataFrame.

  Quote characters are treated as ordinary text (``csv.QUOTE_NONE``), so a
  stray ``"`` inside a field does not merge it with the fields that follow.

  Args:
    path: Location of the file; handed to ``pd.read_csv`` unchanged.

  Returns:
    The parsed ``pandas.DataFrame``.
  """
  frame = pd.read_csv(
    path,
    sep='\t',
    quoting=csv.QUOTE_NONE,
  )
  return frame

In [None]:
# Load the MSR paraphrase training file and preview its first five rows.
df = extract_data(
  path='/content/drive/MyDrive/msrp_project/MSRParaphraseCorpus/msr_paraphrase_train.txt'
)
df.head(5)

Unnamed: 0,Quality,#1 ID,#2 ID,#1 String,#2 String
0,1,702876,702977,"Amrozi accused his brother, whom he called ""th...","Referring to him as only ""the witness"", Amrozi..."
1,0,2108705,2108831,Yucaipa owned Dominick's before selling the ch...,Yucaipa bought Dominick's in 1995 for $693 mil...
2,1,1330381,1330521,They had published an advertisement on the Int...,"On June 10, the ship's owners had published an..."
3,0,3344667,3344648,"Around 0335 GMT, Tab shares were up 19 cents, ...","Tab shares jumped 20 cents, or 4.6%, to set a ..."
4,1,1236820,1236712,"The stock rose $2.11, or about 11 percent, to ...",PG&E Corp. shares jumped $1.63 or 8 percent to...


In [None]:
# Keep only the label and the two sentences, under names that are easier to type.
# `df` is rebound to a new frame instead of being mutated with inplace=True, and
# errors='ignore' skips ID columns that are already gone, so re-running this
# cell no longer raises KeyError.
df = (
  df
  .drop(columns=['#1 ID', '#2 ID'], errors='ignore')
  .rename(columns={'#1 String':'String_1','#2 String':'String_2'})
)
df.head()

Unnamed: 0,Quality,String_1,String_2
0,1,"Amrozi accused his brother, whom he called ""th...","Referring to him as only ""the witness"", Amrozi..."
1,0,Yucaipa owned Dominick's before selling the ch...,Yucaipa bought Dominick's in 1995 for $693 mil...
2,1,They had published an advertisement on the Int...,"On June 10, the ship's owners had published an..."
3,0,"Around 0335 GMT, Tab shares were up 19 cents, ...","Tab shares jumped 20 cents, or 4.6%, to set a ..."
4,1,"The stock rose $2.11, or about 11 percent, to ...",PG&E Corp. shares jumped $1.63 or 8 percent to...


In [None]:
# Glue the two sentences of every row into one hyphen-separated string.
df["merge"] = df[["String_1", "String_2"]].apply(
  lambda pair: "-".join(pair),
  axis=1,
)
# Preview the merged text of the row labelled 0.
df.loc[0, "merge"]

'Amrozi accused his brother, whom he called "the witness", of deliberately distorting his evidence.-Referring to him as only "the witness", Amrozi accused his brother of deliberately distorting his evidence.'

In [None]:
def preproc(data,col):
  """Clean one text column and return the cleaned strings as a list.

  Each value of ``data[col]`` goes through the same steps:
    1. every character that is not an ASCII letter or digit becomes a space,
    2. the text is lower-cased and split on whitespace,
    3. English stop words are dropped,
    4. the remaining tokens are lemmatized with ``WordNetLemmatizer``,
    5. the tokens are joined back together with single spaces.

  Args:
    data: Table holding the text (a pandas DataFrame in this notebook).
    col: Name of the column in ``data`` to clean.

  Returns:
    A list of cleaned strings, one per value of ``data[col]``, in row order.

  Raises:
    TypeError: if a value in the column is not a string (raised by ``re``).
  """
  import re

  import nltk
  from nltk.corpus import stopwords
  from nltk.stem import WordNetLemmatizer

  # Make sure the NLTK resources used below are present on this machine.
  nltk.download('stopwords')
  nltk.download('wordnet')
  nltk.download('omw-1.4')

  lemm = WordNetLemmatizer()
  # Build the stop-word set once. The previous code rebuilt it for every
  # single token, which dominated the running time.
  stop_words = set(stopwords.words('english'))
  non_alnum = re.compile("[^a-zA-Z0-9]")

  corpus = []
  # Walk the column values directly. Looking rows up as data[col][i] is a
  # label lookup and breaks whenever the index is not exactly 0..n-1
  # (for example after filtering rows).
  for text in data[col]:
    tokens = non_alnum.sub(" ", text).lower().split()
    kept = [lemm.lemmatize(word) for word in tokens if word not in stop_words]
    corpus.append(" ".join(kept))
  return corpus


In [None]:
# Clean each sentence column on its own; every call returns a list of cleaned strings.
string1_preproc = preproc(data=df, col='String_1')
string2_preproc = preproc(data=df, col='String_2')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [None]:
# Clean the combined sentence-pair column; this list feeds the vectorizers below.
string_merge = preproc(data=df, col='merge')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [None]:
# Target labels for the classifiers: the Quality column of the frame.
y = df.loc[:, 'Quality']

In [None]:
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score,classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Hold out 15% of the documents BEFORE vectorising. Fitting the vectorizer on
# the whole corpus and splitting afterwards lets test-set words influence
# which terms make it into the vocabulary, so the reported scores are not a
# clean estimate of performance on unseen text.
docs_train, docs_test, y_train, y_test = train_test_split(
  string_merge, y, test_size=0.15, random_state=42
)

# Binary bag-of-words capped at 2500 terms, learned from the training documents only.
cv = CountVectorizer(max_features=2500,binary=True)
X_train = cv.fit_transform(docs_train).toarray()
X_test = cv.transform(docs_test).toarray()
# Whole corpus encoded with the train-fitted vocabulary; kept only so that the
# name `X` stays defined for any other cell that reads it.
X = cv.transform(string_merge).toarray()

mnb = MultinomialNB().fit(X_train, y_train)
y_pred = mnb.predict(X_test)

score = accuracy_score(y_test,y_pred)
print(score)
print(classification_report(y_test,y_pred))

0.7107843137254902
              precision    recall  f1-score   support

           0       0.46      0.39      0.42       165
           1       0.79      0.83      0.81       447

    accuracy                           0.71       612
   macro avg       0.62      0.61      0.62       612
weighted avg       0.70      0.71      0.70       612



In [None]:
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score,classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Hold out 15% of the documents BEFORE vectorising. Fitting TF-IDF on the whole
# corpus would let the test documents shape both the vocabulary and the IDF
# weights, leaking held-out information into the features.
docs_train_tf, docs_test_tf, y_train_tf, y_test_tf = train_test_split(
  string_merge, y, test_size=0.15, random_state=42
)

# TF-IDF features capped at 2500 terms, learned from the training documents only.
tfidf = TfidfVectorizer(max_features=2500)
X_train_tf = tfidf.fit_transform(docs_train_tf).toarray()
X_test_tf = tfidf.transform(docs_test_tf).toarray()
# Whole corpus encoded with the train-fitted vectorizer; kept only so that the
# name `X_tf` stays defined for any other cell that reads it.
X_tf = tfidf.transform(string_merge).toarray()

mnb_tf = MultinomialNB().fit(X_train_tf, y_train_tf)
y_pred_tf = mnb_tf.predict(X_test_tf)

score = accuracy_score(y_test_tf,y_pred_tf)
print(score)
print(classification_report(y_test_tf,y_pred_tf))

0.738562091503268
              precision    recall  f1-score   support

           0       0.54      0.21      0.30       165
           1       0.76      0.94      0.84       447

    accuracy                           0.74       612
   macro avg       0.65      0.57      0.57       612
weighted avg       0.70      0.74      0.69       612



In [None]:
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score,classification_report
from sklearn.model_selection import train_test_split

# Hold out 15% of the documents BEFORE vectorising. Fitting TF-IDF on the whole
# corpus would let the test documents shape both the vocabulary and the IDF
# weights, leaking held-out information into the features.
docs_train_tf, docs_test_tf, y_train_tf, y_test_tf = train_test_split(
  string_merge, y, test_size=0.15, random_state=42
)

# TF-IDF features capped at 2500 terms, learned from the training documents only.
tfidf = TfidfVectorizer(max_features=2500)
X_train_tf = tfidf.fit_transform(docs_train_tf).toarray()
X_test_tf = tfidf.transform(docs_test_tf).toarray()
# Whole corpus encoded with the train-fitted vectorizer; kept only so that the
# name `X_tf` stays defined for any other cell that reads it.
X_tf = tfidf.transform(string_merge).toarray()

# Seed the forest: without random_state every run grows different trees and
# prints different scores, so results could not be reproduced or compared.
rf_tf = RandomForestClassifier(random_state=42).fit(X_train_tf, y_train_tf)
y_pred_tf = rf_tf.predict(X_test_tf)

score = accuracy_score(y_test_tf,y_pred_tf)
print(score)
print(classification_report(y_test_tf,y_pred_tf))

0.7173202614379085
              precision    recall  f1-score   support

           0       0.45      0.24      0.32       165
           1       0.76      0.89      0.82       447

    accuracy                           0.72       612
   macro avg       0.61      0.57      0.57       612
weighted avg       0.68      0.72      0.69       612

