In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer  
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score

In [2]:
# Download the NLTK 'stopwords' package so that stopwords.words('english')
# can be used by the preprocessing step further down.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\khoa0\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Inspect the English stop-word list; these tokens are dropped by stemming().
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

# Loading data

In [4]:
# First load of the training CSV with the default integer index; the frame is
# re-read with `id` as the index a couple of cells further down.
news_df = pd.read_csv('train.csv')

In [5]:
news_df.shape

(20800, 5)

In [7]:
# Load the training data using the `id` column as the row index.
news_df = pd.read_csv('train.csv', index_col='id')

In [8]:
news_df.head()

Unnamed: 0_level_0,title,author,text,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [10]:
news_df.isnull().sum()

title      558
author    1957
text        39
label        0
dtype: int64

In [16]:
# Summarise column dtypes and non-null counts.
# NOTE(review): the saved output below lists a `content` column and no missing
# values, so this cell was last executed after the fillna and `content` cells
# further down; a fresh top-to-bottom run will print something different here.
news_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20800 entries, 0 to 20799
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    20800 non-null  object
 1   author   20800 non-null  object
 2   text     20800 non-null  object
 3   label    20800 non-null  int64 
 4   content  20800 non-null  object
dtypes: int64(1), object(4)
memory usage: 975.0+ KB


In [12]:
# Replace null values with empty strings so the author/text concatenation
# below always operates on strings.
news_df = news_df.fillna('')

In [37]:
# Build the model input text: author name, a single space, then the article
# body. The `title` column is not part of `content`.
news_df['content'] = news_df['author']+' '+news_df['text']

In [38]:
# Separate the features (every column except the target) from the target.
# `columns=` already identifies the axis, so no `axis` argument is passed.
X = news_df.drop(columns='label')
y = news_df['label']

In [39]:
X.head()

Unnamed: 0_level_0,title,author,text,content
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus House Dem Aide: We Didn’t Even S...
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,Daniel J. Flynn Ever get the feeling your life...
2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",Consortiumnews.com Why the Truth Might Get You...
3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,Jessica Purkiss Videos 15 Civilians Killed In ...
4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,Howard Portnoy Print \nAn Iranian woman has be...


# Stemming: reducing a word to its root form
## Example: actor, actress, acting → act

In [40]:
# Single Porter stemmer instance, reused by every call to stemming().
port_stem = PorterStemmer()

In [41]:
# Build the stop-word lookup once. `stopwords.words('english')` returns a list,
# so evaluating it inside the token loop rebuilds that list and scans it
# linearly for every single token; a frozenset gives constant-time membership.
_ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))


def stemming(content):
    """Normalise raw text into a space-separated string of stemmed tokens.

    Processing steps, in order:
      1. replace every character that is not an ASCII letter with a space,
      2. lowercase the text and split it on whitespace,
      3. drop tokens that appear in the NLTK English stop-word list,
      4. stem each remaining token with the shared ``port_stem`` stemmer.

    Parameters
    ----------
    content : str
        Raw document text.

    Returns
    -------
    str
        The surviving stemmed tokens joined by single spaces (an empty string
        when no token survives).
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', content)
    tokens = letters_only.lower().split()
    stemmed_tokens = [
        port_stem.stem(word)
        for word in tokens
        if word not in _ENGLISH_STOP_WORDS
    ]
    return ' '.join(stemmed_tokens)

In [42]:
# Clean and stem every training document.
# NOTE: this overwrites `content`, so re-running the cell stems text that has
# already been stemmed.
news_df['content'] = news_df['content'].apply(stemming)

In [156]:
news_df['content'].head()

id
0    darrel lucu hous dem aid even see comey letter...
1    daniel j flynn ever get feel life circl rounda...
2    consortiumnew com truth might get fire octob t...
3    jessica purkiss video civilian kill singl us a...
4    howard portnoy print iranian woman sentenc six...
Name: content, dtype: object

In [157]:
# Pull the preprocessed text and the labels out as NumPy arrays.
# (This rebinds the X / y names assigned in an earlier cell.)
X = news_df['content'].values
y = news_df['label'].values

In [158]:
print(y)

[1 0 1 ... 0 1 1]


In [159]:
# Learn the TF-IDF vocabulary / IDF weights and vectorise the corpus in one
# pass: fit_transform already returns the document-term matrix, so no separate
# transform() call over the same documents is needed.
# NOTE(review): the vectoriser is fitted on all rows before the
# train/validation split, so validation documents contribute to the vocabulary
# and IDF statistics. Consider fitting on the training portion only.
vector = TfidfVectorizer()
X = vector.fit_transform(X)

In [160]:
print(X)

  (0, 325)	0.045667130650001546
  (0, 521)	0.024137067857825937
  (0, 634)	0.040655263159856764
  (0, 872)	0.016532220885492072
  (0, 921)	0.01722048364790269
  (0, 1297)	0.0221512648103483
  (0, 1608)	0.018872892700164016
  (0, 1883)	0.10135738635191603
  (0, 3017)	0.04897982615007314
  (0, 3037)	0.019964670038636077
  (0, 3372)	0.012420901291132562
  (0, 3734)	0.032425888787420115
  (0, 4147)	0.01753538746404485
  (0, 4250)	0.02041371860891464
  (0, 4296)	0.02774983547853985
  (0, 4577)	0.018585592742346543
  (0, 4760)	0.02724786310672941
  (0, 4810)	0.04374300558451042
  (0, 4827)	0.015121911197501224
  (0, 6405)	0.019162012143130215
  (0, 6947)	0.021688127375720546
  (0, 8535)	0.022721472634340944
  (0, 9048)	0.014973921240518988
  (0, 10821)	0.04993524735197602
  (0, 12223)	0.029276217541933377
  :	:
  (20799, 106602)	0.036578887570993375
  (20799, 106735)	0.049173130101967055
  (20799, 107054)	0.01866563127818435
  (20799, 107087)	0.04310831703624315
  (20799, 107410)	0.017413730

# Train test split

In [161]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split repeatable between runs.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=2025)

In [162]:
# Logistic-regression classifier using the liblinear solver, with a fixed seed.
log_reg  =  LogisticRegression(solver='liblinear', max_iter=1000,random_state=2025)

In [163]:
# Fit the classifier on the training split only.
log_reg.fit(X_train, y_train)

# Model Evaluation

In [164]:
# Accuracy on the rows the model was fitted on (an optimistic figure; compare
# it with the validation accuracy to gauge over-fitting).
y_pred = log_reg.predict(X_train)
# scikit-learn metrics take (y_true, y_pred). Accuracy happens to be symmetric,
# but keeping the documented order avoids silent errors if this is swapped for
# an asymmetric metric such as recall.
training_data_acc = accuracy_score(y_train, y_pred)
print('Accuracy score of training data :', training_data_acc)

Accuracy score of training data : 0.9759014423076923


In [165]:
# Accuracy on the held-out validation split.
y_pred_test = log_reg.predict(X_valid)
# scikit-learn metrics take (y_true, y_pred). Accuracy happens to be symmetric,
# but keeping the documented order avoids silent errors if this is swapped for
# an asymmetric metric such as recall.
valid_data_acc = accuracy_score(y_valid, y_pred_test)
print('Accuracy score of valid data :', valid_data_acc)

Accuracy score of valid data : 0.9466346153846154


# Making a Predictive System

In [166]:
# Classify one validation document (row 3 of the validation matrix).
X_news = X_valid[3]

prediction = log_reg.predict(X_news)
print(prediction)

# Label 0 is reported as real news, any other label as fake.
print('The news is real' if prediction[0] == 0 else 'The news is fake')

[1]
The news is fake


In [167]:
print(y_valid[3])

1


# Predicting on the test set for the Kaggle submission

In [115]:
# Load the Kaggle test set, indexed by `id` like the training frame.
test_df = pd.read_csv('test.csv', index_col='id')

In [116]:
test_df.head()

Unnamed: 0_level_0,title,author,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning..."
20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...
20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...
20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different..."
20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...


In [117]:
test_df.shape

(5200, 3)

In [121]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5200 entries, 20800 to 25999
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   5200 non-null   object
 1   author  5200 non-null   object
 2   text    5200 non-null   object
dtypes: object(3)
memory usage: 162.5+ KB


In [119]:
test_df.isnull().sum()

title     122
author    503
text        7
dtype: int64

In [120]:
# Same missing-value treatment as the training data: nulls become empty strings.
test_df = test_df.fillna('')

In [122]:
# Build `content` the same way as for training: author, a space, then the text.
test_df['content'] = test_df['author']+' '+test_df['text']

In [123]:
# Apply the same cleaning/stemming function that was used on the training text.
# NOTE: this overwrites `content`, so re-running the cell stems it a second time.
test_df['content'] = test_df['content'].apply(stemming)

In [169]:
test_df['content'].head()

id
20800    david streitfeld palo alto calif year scorn po...
20801    russian warship readi strike terrorist near al...
20802    common dream video nodapl nativ american leade...
20803    daniel victor first succeed tri differ sport t...
20804    truth broadcast network min ago view comment l...
Name: content, dtype: object

In [170]:
# Extract the preprocessed test documents as a NumPy array.
X_test = test_df['content'].values

In [171]:
# Reuse the already-fitted vectoriser (transform only, no refit) so the test
# features are expressed in the same vocabulary the classifier was trained on.
X_test = vector.transform(X_test)

In [172]:
print(X_test)

  (0, 634)	0.011378752543269267
  (0, 773)	0.019048910914677836
  (0, 872)	0.02776261214303427
  (0, 886)	0.029612582198597616
  (0, 902)	0.019238018969263172
  (0, 972)	0.011940035511555184
  (0, 1156)	0.024831957088753766
  (0, 1172)	0.02782713941762666
  (0, 1503)	0.023981121884623143
  (0, 1661)	0.03020909113896654
  (0, 1726)	0.016580341839038656
  (0, 1954)	0.03549476824493815
  (0, 2649)	0.022538832047135256
  (0, 2974)	0.015074358026426947
  (0, 3037)	0.0167633675682592
  (0, 3114)	0.03624813198258658
  (0, 3167)	0.014885424714640764
  (0, 3937)	0.022219923158767356
  (0, 4147)	0.014723616515713286
  (0, 4194)	0.011606701688467249
  (0, 4610)	0.039166881524962914
  (0, 4893)	0.01741967228117591
  (0, 5501)	0.012107510819017654
  (0, 5915)	0.024412793446293476
  (0, 5981)	0.026439583238367367
  :	:
  (5199, 106203)	0.031883526655810554
  (5199, 106309)	0.055543501546657034
  (5199, 106403)	0.015307460958137642
  (5199, 106506)	0.03167401472267915
  (5199, 106520)	0.0322690701420

In [180]:
# Predict labels for the test set.
# NOTE: despite its name, y_test holds model predictions, not ground truth.
y_test = log_reg.predict(X_test)

In [183]:
y_test.shape

(5200,)

In [187]:
# Take the ids straight from the test frame's index (it was loaded with
# index_col='id') instead of regenerating them from a hard-coded 20800 offset,
# so each prediction stays paired with the id of the row it was made for.
y_test_df = pd.DataFrame({'id': test_df.index, 'label': y_test})

In [188]:
y_test_df

Unnamed: 0,id,label
0,20800,0
1,20801,1
2,20802,1
3,20803,0
4,20804,1
...,...,...
5195,25995,0
5196,25996,0
5197,25997,0
5198,25998,1


In [189]:
# Write the submission file without the DataFrame's row index, leaving only
# the `id` and `label` columns.
y_test_df.to_csv("y_sub.csv", index=False, encoding='utf-8')