In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # visualization
import seaborn as sns # visualizing data

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('../input/pheme-dataset-for-rumour-detection/dataset.csv'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import regex

In [3]:
!pip install nltk
nltk.download('omw-1.4')

[0m

[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...


True

In [4]:
df = pd.read_csv('../input/pheme-dataset-for-rumour-detection/dataset.csv')
df.head()

Unnamed: 0,text,is_rumor,user.handle,topic
0,Charlie Hebdo became well known for publishing...,0.0,BBCDanielS,charliehebdo
1,"Now 10 dead in a shooting there today RT ""@BBC...",0.0,robbylevy,charliehebdo
2,@BBCDanielS @BBCWorld I'm guessing this is bei...,0.0,ModerateInAll,charliehebdo
3,@BBCDanielS @BBCWorld why would you mention th...,0.0,GabTarquini,charliehebdo
4,@BBCDanielS @BBCWorld perps identified?,0.0,freethought41,charliehebdo


In [5]:
df.shape

(62445, 4)

In [6]:
df.dtypes

text            object
is_rumor       float64
user.handle     object
topic           object
dtype: object

In [7]:
df.describe()

Unnamed: 0,is_rumor
count,62443.0
mean,0.221386
std,0.415183
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


In [8]:
df.isnull().sum()

text               0
is_rumor           2
user.handle        2
topic          12777
dtype: int64

In [9]:
df.dropna(axis=0, inplace=True)

In [10]:
df.isnull().sum()

text           0
is_rumor       0
user.handle    0
topic          0
dtype: int64

In [11]:
df.duplicated().sum()

795

In [12]:
df.drop_duplicates(inplace=True)
df.duplicated().sum()

0

In [13]:
df['text'][40]

'Charlie Hebdo’s Last Tweet Before Shootings http://t.co/9Oa2xAqOcM http://t.co/skJHNEQcn0'

In [14]:
df['is_rumor'].value_counts()

0.0    42590
1.0     6281
Name: is_rumor, dtype: int64

In [15]:
from sklearn.preprocessing import LabelEncoder
enc = LabelEncoder()
df['is_rumor'] = enc.fit_transform(df['is_rumor'])

In [16]:
df['is_rumor'].value_counts()

0    42590
1     6281
Name: is_rumor, dtype: int64

In [17]:
# Stopwords Removal 
sw = stopwords.words('english')
print(sw)
lm = WordNetLemmatizer()

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [18]:
text_msg = []
for i in df['text']:
    t = regex.sub('[^A-Za-z0-9]',' ',i)    # Removal of Punctuations
    t = t.lower()                       # Conversion to lowercase
    t = word_tokenize(t)                # word_tokenization
    t = [i for i in t if i not in sw]   # stopwords removal
    t = [lm.lemmatize(i) for i in t]    # Lemmatization
    t = " ".join(t)
    text_msg.append(t)

In [19]:
print(text_msg[:10])

['charlie hebdo became well known publishing muhammed cartoon two year ago', '10 dead shooting today rt bbcdaniels charlie hebdo became well known publishing muhammed cartoon two year ago', 'bbcdaniels bbcworld guessing considered terrorism right lone wolf', 'bbcdaniels bbcworld would mention knowing fact islamphobiaatitsbest', 'bbcdaniels bbcworld perps identified', 'bbcdaniels bbcworld charlie hebdo', 'im 5h voter french satirical magazine', 'gabtarquini bbcdaniels bbcworld maybe shouted prophet avenged thus making relevant story', 'jakobsen bbcdaniels bbcworld source heard reported yet', 'gabtarquini bbcdaniels bbcworld several men black cagoule heard shout prophet avenged wrote pierre de']


In [20]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=2000) ### Here 2000 implies length of the sentence.
sm = cv.fit_transform(text_msg).toarray()  ### sparse_matrix
print(len(cv.get_feature_names_out()))
print(sm.shape)

2000
(48871, 2000)


In [21]:
print(len(sm[0]))
print(len(sm[2342]))
print(len(sm[3453]))
print(len(sm[876]))

2000
2000
2000
2000


In [22]:
x = sm              # independent variable
y = df['is_rumor']  # target variable
print(type(x))
print(type(y))

<class 'numpy.ndarray'>
<class 'pandas.core.series.Series'>


In [23]:
print(sm[:4])

[[0 0 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [24]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.25)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(36653, 2000)
(12218, 2000)
(36653,)
(12218,)


In [25]:
from sklearn.naive_bayes import MultinomialNB, GaussianNB
m1 = MultinomialNB()
m1.fit(x_train,y_train)

MultinomialNB()

In [26]:
# Accuracy
print('Training Score', m1.score(x_train,y_train))
print('Testing Score', m1.score(x_test,y_test))

Training Score 0.9210432979565111
Testing Score 0.9150433786217057


In [27]:
ypred_m1 = m1.predict(x_test)
print(ypred_m1)

[1 1 0 ... 0 0 0]


In [28]:
m2 = GaussianNB()
m2.fit(x_train, y_train)

GaussianNB()

In [29]:
# Accuracy
print('Training Score', m2.score(x_train,y_train))
print('Testing Score', m2.score(x_test,y_test))

Training Score 0.6404387089733447
Testing Score 0.6371746603372074


In [30]:
ypred_m2 = m2.predict(x_test)
print(ypred_m2)

[1 1 0 ... 0 0 0]


In [31]:
from sklearn.ensemble import RandomForestClassifier
m3 = RandomForestClassifier()
m3.fit(x_train,y_train)

RandomForestClassifier()

In [32]:
# Accuracy
print('Training Score', m3.score(x_train,y_train))
print('Testing Score', m3.score(x_test,y_test))

Training Score 0.9954437563091698
Testing Score 0.9286298903257489


In [33]:
# KNeighbours Classifier
from sklearn.neighbors import KNeighborsClassifier
m4 = KNeighborsClassifier(n_neighbors = 1)
m4.fit(x_train,y_train)
print(m4.score(x_train,y_train))
print(m4.score(x_test,y_test))

0.9943524404550788
0.8615976428220659


In [34]:
# Decision Tree
from sklearn import tree
m5 = tree.DecisionTreeClassifier()
m5.fit(x_train, y_train)
print('Training_score', m5.score(x_train, y_train))
print('Testing_score', m5.score(x_test, y_test))

Training_score 0.995471039205522
Testing_score 0.9042396464233099


In [35]:
# Gradient Boosting Classifier
from sklearn.ensemble import GradientBoostingClassifier
m6 = GradientBoostingClassifier()
m6.fit(x_train, y_train)
print('Training_score', m6.score(x_train, y_train))
print('Testing_score', m6.score(x_test, y_test))

Training_score 0.9059831391700542
Testing_score 0.9030119495825831


In [36]:
from sklearn.ensemble import VotingClassifier
vc = VotingClassifier(estimators=[('mnb', m1), ('gnb', m2), ('rf', m3), ('knn', m4), ('dt', m5), ('gb', m6)], voting='hard')
vc.fit(x_train, y_train)
final_pred = print('Testing score', vc.score(x_test, y_test))

Testing score 0.9327222131281715


In [37]:
predictions = vc.predict(x_test)
print(predictions[:100])

[1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0]


In [38]:
from sklearn.metrics import confusion_matrix, classification_report
cm = confusion_matrix(y_test, predictions)
print(cm)
print(classification_report(y_test,predictions))

[[10490   144]
 [  678   906]]
              precision    recall  f1-score   support

           0       0.94      0.99      0.96     10634
           1       0.86      0.57      0.69      1584

    accuracy                           0.93     12218
   macro avg       0.90      0.78      0.83     12218
weighted avg       0.93      0.93      0.93     12218

