## Import Necessary Libraries

In [1]:
import pandas as pd 
import numpy as np 
import nltk

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn import metrics

## Read the Input Files

In [2]:
# Load the labelled training headlines, the unlabelled test headlines and the
# submission template from the working directory.
train = pd.read_csv("Train_Data.csv")
test = pd.read_csv("Test_Data.csv")
sample = pd.read_csv("Sample_Submission.csv")

# (rows, columns) of each frame, shown together as a quick sanity check.
tuple(frame.shape for frame in (train, test, sample))

((44262, 2), (11066, 1), (11066, 1))

In [3]:
# Preview the first three training rows: the headline text and its is_sarcastic label.
train.head(3)

Unnamed: 0,headline,is_sarcastic
0,supreme court votes 7-2 to legalize all worldl...,1
1,hungover man horrified to learn he made dozens...,1
2,emily's list founder: women are the 'problem s...,0


In [4]:
# Preview the first three test rows; only the headline text is present, no label.
test.head(3)

Unnamed: 0,headline
0,area stand-up comedian questions the deal with...
1,dozens of glowing exit signs mercilessly taunt...
2,perfect response to heckler somewhere in prop ...


In [5]:
# Preview the submission template: a single "prediction" column.
sample.head(3)

Unnamed: 0,prediction
0,1
1,1
2,1


In [6]:
# Count missing values per column in the training data set.
train.isna().sum()

headline        0
is_sarcastic    0
dtype: int64

In [7]:
# Class balance of the "is_sarcastic" target: absolute counts first, then the
# same counts as a percentage of all training rows.
# Each print is its own statement; bundling the calls into a tuple made the
# cell echo a meaningless (None, None, None) as its result.
label_counts = train["is_sarcastic"].value_counts()
print(label_counts)
print("-" * 50)
print(label_counts / train.shape[0] * 100)

0    23958
1    20304
Name: is_sarcastic, dtype: int64
--------------------------------------------------
0    54.127694
1    45.872306
Name: is_sarcastic, dtype: float64


(None, None, None)

In [8]:
# Split the labelled data into a training part and a held-out validation part.
# X is the headline text, y the is_sarcastic label; 30% of the rows are held
# out (test_size=0.3) and random_state=42 makes the shuffle reproducible.
X=train['headline']
y=train["is_sarcastic"]
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.3, random_state=42)

In [9]:
# Print the shape of every split, one per line, to confirm that features and
# labels have matching lengths.
print(*(part.shape for part in (X_train, X_test, y_train, y_test)), sep="\n")

(30983,)
(13279,)
(30983,)
(13279,)


In [10]:
# Bundle preprocessing and the classifier in one Pipeline so the same steps
# run, in the same order, at both fit and predict time:
#   vect  - turn each headline into word counts
#   tfidf - re-weight the counts into TF-IDF values
#   clf   - logistic regression on the TF-IDF features
text_clf = Pipeline(
    steps=[
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        ("clf", LogisticRegression()),
    ]
)



In [11]:
# Fit the whole pipeline (vectorizer, TF-IDF transformer and classifier) on the
# training split; the result is bound back to text_clf.
text_clf = text_clf.fit(X_train, y_train)

In [12]:
# Predict the is_sarcastic label for every headline in the held-out split X_test.
predicted = text_clf.predict(X_test)

## Accuracy of the Model

In [13]:
# Share of held-out headlines whose predicted label matches the true label,
# printed first as a fraction and then as a percentage.
accuracy = metrics.accuracy_score(y_test, predicted)
print('accuracy_score', accuracy)
print('accuracy_score', accuracy * 100)

accuracy_score 0.8772497929060923
accuracy_score 87.72497929060923


- About 88% of the held-out headlines (the 30% validation split) were classified correctly

In [14]:
# Per-class precision, recall and F1 on the held-out split, followed by the
# confusion matrix. In scikit-learn's convention the matrix rows are the true
# labels and the columns are the predicted labels.
print(metrics.classification_report(y_test, predicted))
pd.DataFrame(metrics.confusion_matrix(y_test, predicted))

              precision    recall  f1-score   support

           0       0.89      0.88      0.89      7119
           1       0.87      0.87      0.87      6160

    accuracy                           0.88     13279
   macro avg       0.88      0.88      0.88     13279
weighted avg       0.88      0.88      0.88     13279



Unnamed: 0,0,1
0,6298,821
1,809,5351


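- 88% of the actual 0s were classified as 0 (recall for class 0)
- 89% of the headlines classified as 0 were actually 0 (precision for class 0)
- 87% of the actual 1s were classified as 1 (recall for class 1)
- 87% of the headlines classified as 1 were actually 1 (precision for class 1)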

## Predicting on Test Data

In [15]:
# Apply the fitted pipeline to the headlines of the unlabelled test file.
testdata_predicted=text_clf.predict(test["headline"])

In [16]:
# Sanity check: there should be exactly one prediction per test row.
len(testdata_predicted),len(test)

(11066, 11066)

In [17]:
# Overwrite the template's "prediction" column with the model output.
# The array is assigned directly so values are placed by position; wrapping it
# in a DataFrame first would make the assignment depend on index alignment.
sample["prediction"] = testdata_predicted

In [18]:
# Write the submission file. index=False keeps the output to the single
# "prediction" column of the template; without it pandas also writes the row
# index as an extra unnamed first column.
sample.to_csv("Prediction.csv", index=False)