# Fake/Real News data analysis and classification
Ευάγγελος Δημητριάδης<br>
1115201700287

# Initialisation :

Give access to google drive

In [None]:
# Mount Google Drive inside the Colab runtime at /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

Data file locations (Indicative paths)

In [None]:
from os import path

# Use the first Drive mount directory that exists relative to the working
# directory; the four CSV paths are only defined when one of them is found.
for _root in ('./gdrive', './drive'):
    if path.exists(_root):
        true_data = _root + '/MyDrive/Data/True.csv'
        fake_data = _root + '/MyDrive/Data/Fake.csv'
        train_data = _root + '/MyDrive/Data/train.csv'
        test_data = _root + '/MyDrive/Data/test.csv'
        break

Import Libraries

In [None]:
!pip install nltk==3.4
!pip install --upgrade gensim

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from gensim.parsing.preprocessing import remove_stopwords
from gensim.models import Word2Vec
import nltk
nltk.download('punkt')
nltk.download('wordnet')
from nltk.util import ngrams
from nltk import word_tokenize
import numpy as np
import statistics
import re

In [None]:
# Show every row when a DataFrame/Series is displayed (no truncation).
pd.options.display.max_rows = None

# Data analysis:

Create Dataframe

In [None]:
# Load the real (dft) and fake (dff) news files, discarding rows with missing values.
dft = pd.read_csv(true_data)
dft = dft.dropna(axis='rows')
dff = pd.read_csv(fake_data)
dff = dff.dropna(axis='rows')

### **1.** Visualization of fake and true news titles

In [None]:
def preprocessing(text):
    """Return *text* without characters that are neither word characters nor whitespace."""
    return re.sub(r'[^\w\s]', '', text)

Visualise most common subjects

In [None]:
# Real News: bar chart of the number of articles per subject
df1 = pd.DataFrame()
df1['num_subjects'] = dft['subject'].value_counts()
df1.plot.bar(title="Real News most common subjects")
plt.show()

# Fake News: bar chart of the number of articles per subject
df2 = pd.DataFrame()
df2['num_subjects'] = dff['subject'].value_counts()
df2.plot.bar(title="Fake News most common subjects")
plt.show()

Visualise most common words found in titles

In [None]:
# Real News
# Clean each title (strip punctuation, lowercase, drop stopwords), split it
# into words and count how often each word occurs over all titles.
counts_df = (
    dft['title']
    .apply(preprocessing)
    .apply(str.lower)
    .apply(remove_stopwords)
    .str.split(expand=True)
    .stack()
    .value_counts()
)
counts_df = pd.DataFrame({'Unigram': counts_df.index, 'Appearances': counts_df.values})

# Keep the 20 most frequent words, numbered from 1, in title case.
counts_df = counts_df.sort_values(by=['Appearances'], ascending=False).head(20)
counts_df.index = range(1, len(counts_df) + 1)
counts_df['Unigram'] = counts_df['Unigram'].str.title()

# word -> number of appearances, the mapping WordCloud draws from
d = dict(zip(counts_df['Unigram'], counts_df['Appearances']))

wordcloud = WordCloud().generate_from_frequencies(frequencies=d)
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.title("Real News most common title words")
plt.show()

In [None]:
# Fake News
# Clean each title (strip punctuation, lowercase, drop stopwords), split it
# into words and count how often each word occurs over all titles.
counts_df = (
    dff['title']
    .apply(preprocessing)
    .apply(str.lower)
    .apply(remove_stopwords)
    .str.split(expand=True)
    .stack()
    .value_counts()
)
counts_df = pd.DataFrame({'Unigram': counts_df.index, 'Appearances': counts_df.values})

# Keep the 20 most frequent words, numbered from 1, in title case.
counts_df = counts_df.sort_values(by=['Appearances'], ascending=False).head(20)
counts_df.index = range(1, len(counts_df) + 1)
counts_df['Unigram'] = counts_df['Unigram'].str.title()

# word -> number of appearances, the mapping WordCloud draws from
d = dict(zip(counts_df['Unigram'], counts_df['Appearances']))

wordcloud = WordCloud().generate_from_frequencies(frequencies=d)
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.title("Fake News most common title words")
plt.show()

### **2.** Average number of characters used

* Titles



In [None]:
# Real News: number of articles and total number of title characters
num_items_t = len(dft.index)
df1 = pd.DataFrame()
df1['char_count'] = dft['title'].str.len()
num_characters_t = df1['char_count'].sum()

# Fake News: number of articles and total number of title characters
num_items_f = len(dff.index)
df2 = pd.DataFrame()
df2['char_count'] = dff['title'].str.len()
num_characters_f = df2['char_count'].sum()

# Graph: average characters per title, largest bar first
data = [num_characters_t / num_items_t, num_characters_f / num_items_f]
avg = pd.DataFrame(data, columns=['Average characters'], index=['Real News', 'Fake News'])
avg = avg.sort_values(by=['Average characters'], ascending=False)

avg.plot.bar(title='Average number of characters in title')
plt.show()

* Text

In [None]:
# Real News: number of articles and total number of text characters
num_items_t = len(dft.index)
df1 = pd.DataFrame()
df1['char_count'] = dft['text'].str.len()
num_characters_t = df1['char_count'].sum()

# Fake News: number of articles and total number of text characters
num_items_f = len(dff.index)
df2 = pd.DataFrame()
df2['char_count'] = dff['text'].str.len()
num_characters_f = df2['char_count'].sum()

# Graph: average characters per article text, largest bar first
data = [num_characters_t / num_items_t, num_characters_f / num_items_f]
avg = pd.DataFrame(data, columns=['Average characters'], index=['Real News', 'Fake News'])
avg = avg.sort_values(by=['Average characters'], ascending=False)

print('Text:')
avg.plot.bar(title='Average number of characters in text')
plt.show()

### **3.** Number of words distribution graph

* Titles

In [None]:
# Real News: histogram of the number of whitespace-separated words per title
df = pd.DataFrame()
df['num_words'] = dft['title'].map(lambda title: len(title.split()))
df.hist()
plt.title("Real news titles No. words distribution")
plt.xlabel('No.Words')
plt.ylabel('No.Articles')
plt.show()

In [None]:
# Fake News: histogram of the number of whitespace-separated words per title
df = pd.DataFrame()
df['num_words'] = dff['title'].map(lambda title: len(title.split()))
df.hist()
plt.title("Fake news titles No. words distribution")
plt.xlabel('No.Words')
plt.ylabel('No.Articles')
plt.show()

* Text

In [None]:
# Real News: histogram of the number of whitespace-separated words per article text
df = pd.DataFrame()
df['num_words'] = dft['text'].map(lambda body: len(body.split()))
df.hist()
plt.title("Real news text No. words distribution")
plt.xlabel('No.Words')
plt.ylabel('No.Articles')
plt.show()

In [None]:
# Fake News: histogram of the number of whitespace-separated words per article text
df = pd.DataFrame()
df['num_words'] = dff['text'].map(lambda body: len(body.split()))
df.hist()
plt.title("Fake news text No. words distribution")
plt.xlabel('No.Words')
plt.ylabel('No.Articles')
plt.show()

### **4.** Number of words distribution graph (stopwords removed)

In [None]:
def preprocessing(text):
    """Strip from *text* every character that is not a word character or whitespace."""
    cleaned = re.sub(r'[^\w\s]', '', text)
    return cleaned

* Titles

In [None]:
# Real News: titles with punctuation and stopwords removed, lowercased
dft_sw = pd.DataFrame()
dft_sw['title'] = dft['title'].apply(preprocessing).apply(str.lower).apply(remove_stopwords)

# Histogram of the remaining words per title
df = pd.DataFrame()
df['num_words'] = dft_sw['title'].map(lambda title: len(title.split()))
df.hist()
plt.title("Real news titles No. words distribution (no stopwords)")
plt.show()

In [None]:
# Fake News: titles with punctuation and stopwords removed, lowercased
dff_sw = pd.DataFrame()
dff_sw['title'] = dff['title'].apply(preprocessing).apply(str.lower).apply(remove_stopwords)

# Histogram of the remaining words per title
df = pd.DataFrame()
df['num_words'] = dff_sw['title'].map(lambda title: len(title.split()))
df.hist()
plt.title("Fake news titles No. words distribution (no stopwords)")
plt.show()

* Text

In [None]:
# Real News: article texts with punctuation and stopwords removed, lowercased
dft_sw = pd.DataFrame()
dft_sw['text'] = dft['text'].apply(preprocessing).apply(str.lower).apply(remove_stopwords)

# Histogram of the remaining words per article text
df = pd.DataFrame()
df['num_words'] = dft_sw['text'].map(lambda body: len(body.split()))
df.hist()
plt.title("Real news text No. words distribution (no stopwords)")
plt.show()

In [None]:
# Fake News: article texts with punctuation and stopwords removed, lowercased
dff_sw = pd.DataFrame()
dff_sw['text'] = dff['text'].apply(preprocessing).apply(str.lower).apply(remove_stopwords)

# Histogram of the remaining words per article text
df = pd.DataFrame()
df['num_words'] = dff_sw['text'].map(lambda body: len(body.split()))
df.hist()
plt.title("Fake news text No. words distribution (no stopwords)")
plt.show()

### **5.** Most usual bigrams

In [None]:
def get_ngrams(text, n=2):
    """Return the word n-grams of *text* as space-joined strings.

    Args:
        text: String to analyse; it is tokenised with ``word_tokenize``.
        n: Number of tokens per n-gram. Defaults to 2 (bigrams), the size
            that was previously hard-coded.

    Returns:
        A list with one string per n-gram, in order of appearance.
    """
    return [' '.join(gram) for gram in ngrams(word_tokenize(text), n)]

In [None]:
def preprocessing(text):
    """Remove punctuation from *text*, then remove stand-alone single ASCII letters."""
    without_punctuation = re.sub(r'[^\w\s]', '', text)
    return re.sub(r'\b[a-zA-Z]\b', '', without_punctuation)

* Titles

In [None]:
# Real News
# Clean every title and turn it into its list of bigrams.
df = dft['title'].apply(preprocessing).apply(str.lower).apply(remove_stopwords).apply(get_ngrams)
df = pd.DataFrame({'Bigrams': df})
# explode() yields one row per bigram, so the counts are computed without
# first building a wide (rows x longest list) DataFrame.
df2 = df['Bigrams'].explode().value_counts()

# Keep the 20 most frequent bigrams, numbered from 1, in title case.
counts_df = pd.DataFrame({'Bigram': df2.index, 'Appearances': df2.values})
counts_df = counts_df.sort_values(by=['Appearances'], ascending=False)
counts_df = counts_df[:20]
counts_df.index = range(1, len(counts_df) + 1)
counts_df['Bigram'] = counts_df['Bigram'].str.title()

# bigram -> number of appearances, the mapping WordCloud draws from
d = dict(zip(counts_df['Bigram'], counts_df['Appearances']))

wordcloud = WordCloud()
wordcloud.generate_from_frequencies(frequencies=d)
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.title("Real News most common bigrams in titles")
plt.show()


In [None]:
# Fake News
# Clean every title and turn it into its list of bigrams.
df = dff['title'].apply(preprocessing).apply(str.lower).apply(remove_stopwords).apply(get_ngrams)
df = pd.DataFrame({'Bigrams': df})
# explode() yields one row per bigram, so the counts are computed without
# first building a wide (rows x longest list) DataFrame.
df2 = df['Bigrams'].explode().value_counts()

# Keep the 20 most frequent bigrams, numbered from 1, in title case.
counts_df = pd.DataFrame({'Bigram': df2.index, 'Appearances': df2.values})
counts_df = counts_df.sort_values(by=['Appearances'], ascending=False)
counts_df = counts_df[:20]
counts_df.index = range(1, len(counts_df) + 1)
counts_df['Bigram'] = counts_df['Bigram'].str.title()

# bigram -> number of appearances, the mapping WordCloud draws from
d = dict(zip(counts_df['Bigram'], counts_df['Appearances']))

wordcloud = WordCloud()
wordcloud.generate_from_frequencies(frequencies=d)
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.title("Fake News most common bigrams in titles")
plt.show()


* Text

In [None]:
# Real News
# Clean every article text and turn it into its list of bigrams.
df = dft['text'].apply(preprocessing).apply(str.lower).apply(remove_stopwords).apply(get_ngrams)
df = pd.DataFrame({'Bigrams': df})
# explode() yields one row per bigram, so the counts are computed without
# first building a wide (rows x longest list) DataFrame, which is very
# memory-hungry for full article texts.
df2 = df['Bigrams'].explode().value_counts()

# Keep the 20 most frequent bigrams, numbered from 1, in title case.
counts_df = pd.DataFrame({'Bigram': df2.index, 'Appearances': df2.values})
counts_df = counts_df.sort_values(by=['Appearances'], ascending=False)
counts_df = counts_df[:20]
counts_df.index = range(1, len(counts_df) + 1)
counts_df['Bigram'] = counts_df['Bigram'].str.title()

# bigram -> number of appearances, the mapping WordCloud draws from
d = dict(zip(counts_df['Bigram'], counts_df['Appearances']))

wordcloud = WordCloud()
wordcloud.generate_from_frequencies(frequencies=d)
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.title("Real News most common bigrams in text")
plt.show()


In [None]:
# Fake News
# Clean every article text and turn it into its list of bigrams.
df = dff['text'].apply(preprocessing).apply(str.lower).apply(remove_stopwords).apply(get_ngrams)
df = pd.DataFrame({'Bigrams': df})
# explode() yields one row per bigram, so the counts are computed without
# first building a wide (rows x longest list) DataFrame, which is very
# memory-hungry for full article texts.
df2 = df['Bigrams'].explode().value_counts()

# Keep the 20 most frequent bigrams, numbered from 1, in title case.
counts_df = pd.DataFrame({'Bigram': df2.index, 'Appearances': df2.values})
counts_df = counts_df.sort_values(by=['Appearances'], ascending=False)
counts_df = counts_df[:20]
counts_df.index = range(1, len(counts_df) + 1)
counts_df['Bigram'] = counts_df['Bigram'].str.title()

# bigram -> number of appearances, the mapping WordCloud draws from
d = dict(zip(counts_df['Bigram'], counts_df['Appearances']))

wordcloud = WordCloud()
wordcloud.generate_from_frequencies(frequencies=d)
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.title("Fake News most common bigrams in text")
plt.show()


# Classification dataset creation:

Create one csv file for training and one for testing.<br>
The train csv will contain entries from both the fake and the true news files, with an extra column showing whether each entry is true or fake (1 or 0 respectively).<br>
The rest of the entries will be put on the test csv.

In [None]:
# Re-read the real (dft) and fake (dff) news files, this time keeping every row.
dft, dff = (pd.read_csv(csv_path) for csv_path in (true_data, fake_data))

### Data pre-processing/clean-up:

* Remove rows with null values
* Remove punctuation marks
* Remove digits
* Make words lowercase
* Remove stop words

In [None]:
def preprocessing(text):
    """Remove punctuation from *text*, then remove every run of digits."""
    without_punctuation = re.sub(r'[^\w\s]', '', text)
    return re.sub(r'\d+', '', without_punctuation)

# Drop incomplete rows BEFORE cleaning: preprocessing() passes each value to
# re.sub(), which raises TypeError on the NaN pandas uses for a missing text.
# .copy() makes the column assignment below operate on an independent frame.
dft = dft.dropna(axis='rows').copy()
dft['text'] = dft['text'].apply(preprocessing).apply(str.lower).apply(remove_stopwords)

dff = dff.dropna(axis='rows').copy()
dff['text'] = dff['text'].apply(preprocessing).apply(str.lower).apply(remove_stopwords)

### Training and testing Dataset Creation:

In [None]:
# Ratio of data from each file to be put on the train csv
ratio = 1/8

# Label the rows (1 = real, 0 = fake) and shuffle each file independently.
dft["label"] = 1
dff["label"] = 0
dft = dft.sample(frac=1).reset_index(drop=True)
dff = dff.sample(frac=1).reset_index(drop=True)

# Number of leading (already shuffled) rows of each file that go to training.
n_true = round(len(dft.index) * ratio)
n_fake = round(len(dff.index) * ratio)

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df_trn = pd.concat([dft.iloc[:n_true], dff.iloc[:n_fake]], ignore_index=True)

# As before, dft/dff end up holding only the rows left for testing.
dft = dft.iloc[n_true:]
dff = dff.iloc[n_fake:]
df_tst = pd.concat([dft, dff], ignore_index=True)

# Shuffle again so real and fake entries are interleaved in both files.
df_trn = df_trn.sample(frac=1).reset_index(drop=True)
df_tst = df_tst.sample(frac=1).reset_index(drop=True)

df_trn.to_csv(train_data, index=False)
df_tst.to_csv(test_data, index=False)

# Classification implementation:

Classify articles as real or fake using:
*   Logistic Regression
*   Naive Bayes
*   Support Vector Machines
*   Random Forests

Evaluate each method using:
*   Accuracy
*   F1 Score

train.csv will be used for training the models and test.csv for testing their accuracy.




In [None]:
# Load the train/test files written earlier, discarding rows with missing values.
df_trn = pd.read_csv(train_data)
df_trn = df_trn.dropna(axis='rows')
df_tst = pd.read_csv(test_data)
df_tst = df_tst.dropna(axis='rows')

# Ratio of lines to be read from each file
# Read a portion to avoid running out of RAM (files are too big)
ratio = 3/10
df_trn = df_trn.head(round(len(df_trn.index) * ratio))
df_tst = df_tst.head(round(len(df_tst.index) * ratio))

## Bag Of Words

In [None]:
# Learn the unigram vocabulary on the training texts only, then encode both
# sets as dense count matrices.
bow_v = CountVectorizer(ngram_range=(1, 1), stop_words='english', max_df=1.0, min_df=1)
features_trn = bow_v.fit_transform(df_trn['text']).toarray()

bow_x = bow_v.transform(df_tst['text'])
features_tst = bow_x.toarray()

# Feature matrices and labels consumed by the classifier cells
traindata, trainlabel = np.asarray(features_trn), np.asarray(df_trn['label'])
testdata, testlabel = np.asarray(features_tst), np.asarray(df_tst['label'])

### Classifications:

In [None]:
# Logistic Regression: fit on the training set, then report accuracy and
# weighted F1 on the test set.
model = LogisticRegression(C=20).fit(traindata, trainlabel)
accuracy = model.score(testdata, testlabel)
lr_pred = model.predict(testdata)
f1_scr = f1_score(testlabel, lr_pred, average='weighted')

print("Logistic Regression")
print("accuracy = ", accuracy * 100, "%")
print("f1 score = ", f1_scr * 100, "%")

In [None]:
# Naive Bayes: fit on the training set, then report accuracy and weighted F1
# on the test set.
model = GaussianNB().fit(traindata, trainlabel)
accuracy = model.score(testdata, testlabel)
lr_pred = model.predict(testdata)
f1_scr = f1_score(testlabel, lr_pred, average='weighted')

print("Naive Bayes")
print("accuracy = ", accuracy * 100, "%")
print("f1 score = ", f1_scr * 100, "%")

In [None]:
# Support Vector Machines
# Using LinearSVC for better speed and accuracy
model = LinearSVC(C=100).fit(traindata, trainlabel)
accuracy = model.score(testdata, testlabel)
lr_pred = model.predict(testdata)
f1_scr = f1_score(testlabel, lr_pred, average='weighted')

print("Support Vector Machines")
print("accuracy = ", accuracy * 100, "%")
print("f1 score = ", f1_scr * 100, "%")

In [None]:
# Random Forest (200 trees): fit on the training set, then report accuracy
# and weighted F1 on the test set.
model = RandomForestClassifier(n_estimators=200).fit(traindata, trainlabel)
accuracy = model.score(testdata, testlabel)
lr_pred = model.predict(testdata)
f1_scr = f1_score(testlabel, lr_pred, average='weighted')

print("Random Forests")
print("accuracy = ", accuracy * 100, "%")
print("f1 score = ", f1_scr * 100, "%")

## TF-IDF

In [None]:
# Learn the unigram vocabulary and IDF weights on the training texts only,
# then encode both sets as dense TF-IDF matrices.
tfidf_v = TfidfVectorizer(ngram_range=(1, 1), stop_words='english', max_df=1.0, min_df=1)
features_trn = tfidf_v.fit_transform(df_trn['text']).toarray()

tfidf_x = tfidf_v.transform(df_tst['text'])
features_tst = tfidf_x.toarray()

# Feature matrices and labels consumed by the classifier cells
traindata, trainlabel = np.asarray(features_trn), np.asarray(df_trn['label'])
testdata, testlabel = np.asarray(features_tst), np.asarray(df_tst['label'])

### Classifications:

In [None]:
# Logistic Regression: fit on the training set, then report accuracy and
# weighted F1 on the test set.
model = LogisticRegression(C=20).fit(traindata, trainlabel)
accuracy = model.score(testdata, testlabel)
lr_pred = model.predict(testdata)
f1_scr = f1_score(testlabel, lr_pred, average='weighted')

print("Logistic Regression")
print("accuracy = ", accuracy * 100, "%")
print("f1 score = ", f1_scr * 100, "%")

In [None]:
# Naive Bayes: fit on the training set, then report accuracy and weighted F1
# on the test set.
model = GaussianNB().fit(traindata, trainlabel)
accuracy = model.score(testdata, testlabel)
lr_pred = model.predict(testdata)
f1_scr = f1_score(testlabel, lr_pred, average='weighted')

print("Naive Bayes")
print("accuracy = ", accuracy * 100, "%")
print("f1 score = ", f1_scr * 100, "%")

In [None]:
# Support Vector Machines
# Using LinearSVC for better speed and accuracy
model = LinearSVC(C=100).fit(traindata, trainlabel)
accuracy = model.score(testdata, testlabel)
lr_pred = model.predict(testdata)
f1_scr = f1_score(testlabel, lr_pred, average='weighted')

print("Support Vector Machines")
print("accuracy = ", accuracy * 100, "%")
print("f1 score = ", f1_scr * 100, "%")

In [None]:
# Random Forest (200 trees): fit on the training set, then report accuracy
# and weighted F1 on the test set.
model = RandomForestClassifier(n_estimators=200).fit(traindata, trainlabel)
accuracy = model.score(testdata, testlabel)
lr_pred = model.predict(testdata)
f1_scr = f1_score(testlabel, lr_pred, average='weighted')

print("Random Forests")
print("accuracy = ", accuracy * 100, "%")
print("f1 score = ", f1_scr * 100, "%")

## Word2Vec

In [None]:
def _mean_doc_vectors(w2v, tokenized_docs, size=100):
    """Return a (n_docs, size) array holding the mean word vector of each document.

    A document without tokens gets an all-zero row. The previous code started
    from a Python list and used ``+=`` with a NumPy vector, which *extends*
    the list instead of adding element-wise and produced ragged rows.
    """
    rows = [
        w2v.wv[words].mean(axis=0) if words else np.zeros(size, dtype=np.float32)
        for words in tokenized_docs
    ]
    # reshape keeps the result two-dimensional even when there are no documents
    return np.asarray(rows, dtype=np.float32).reshape(-1, size)


# Train
# Word2Vec model
tokens = [sent.lower().split() for sent in df_trn['text']]

# NOTE(review): passing the corpus to the constructor appears to build the
# vocabulary and train already; the explicit calls are kept as in the
# original - confirm against the gensim documentation before removing them.
w2vmodel = Word2Vec(tokens, vector_size=100, seed=32, negative=5, sg=0, min_count=1, window=1)
w2vmodel.build_vocab(tokens)  # prepare the model vocabulary
w2vmodel.train(tokens, total_examples=w2vmodel.corpus_count, epochs=5)  # train the model

vectors = _mean_doc_vectors(w2vmodel, tokens)

trainlabel = np.asarray(df_trn['label'])
traindata = vectors

# Drop all-zero rows (documents without words) with a boolean mask; deleting
# rows while enumerating the array shifts the indices and can skip or
# mis-target rows.
keep = traindata.any(axis=1)
traindata, trainlabel = traindata[keep], trainlabel[keep]

# Test
# Word2Vec model
# NOTE(review): the test documents are embedded with a second model trained
# only on the test texts, so train and test vectors do not share an embedding
# space. This is kept because the notebook's conclusions describe it;
# consider embedding the test set with w2vmodel instead.
tokens_tst = [sent.lower().split() for sent in df_tst['text']]

w2vmodel_tst = Word2Vec(tokens_tst, vector_size=100, seed=32, negative=5, sg=0, min_count=1, window=1)
w2vmodel_tst.build_vocab(tokens_tst)  # prepare the model vocabulary
w2vmodel_tst.train(tokens_tst, total_examples=w2vmodel_tst.corpus_count, epochs=5)  # train the model

vectors_tst = _mean_doc_vectors(w2vmodel_tst, tokens_tst)

testlabel = np.asarray(df_tst['label'])
testdata = vectors_tst

keep = testdata.any(axis=1)
testdata, testlabel = testdata[keep], testlabel[keep]

### Classifications:

In [None]:
# Logistic Regression: fit on the training set, then report accuracy and
# weighted F1 on the test set.
model = LogisticRegression(C=20).fit(traindata, trainlabel)
accuracy = model.score(testdata, testlabel)
lr_pred = model.predict(testdata)
f1_scr = f1_score(testlabel, lr_pred, average='weighted')

print("Logistic Regression")
print("accuracy = ", accuracy * 100, "%")
print("f1 score = ", f1_scr * 100, "%")

In [None]:
# Naive Bayes: fit on the training set, then report accuracy and weighted F1
# on the test set.
model = GaussianNB().fit(traindata, trainlabel)
accuracy = model.score(testdata, testlabel)
lr_pred = model.predict(testdata)
f1_scr = f1_score(testlabel, lr_pred, average='weighted')

print("Naive Bayes")
print("accuracy = ", accuracy * 100, "%")
print("f1 score = ", f1_scr * 100, "%")

In [None]:
# Support Vector Machines
# Using LinearSVC for better speed and accuracy
model = LinearSVC(C=100).fit(traindata, trainlabel)
accuracy = model.score(testdata, testlabel)
lr_pred = model.predict(testdata)
f1_scr = f1_score(testlabel, lr_pred, average='weighted')

print("Support Vector Machines")
print("accuracy = ", accuracy * 100, "%")
print("f1 score = ", f1_scr * 100, "%")

In [None]:
# Random Forest (200 trees): fit on the training set, then report accuracy
# and weighted F1 on the test set.
model = RandomForestClassifier(n_estimators=200).fit(traindata, trainlabel)
accuracy = model.score(testdata, testlabel)
lr_pred = model.predict(testdata)
f1_scr = f1_score(testlabel, lr_pred, average='weighted')

print("Random Forests")
print("accuracy = ", accuracy * 100, "%")
print("f1 score = ", f1_scr * 100, "%")

## Conclusions

Όπως ήταν αναμενόμενο, τα classifications με TF-IDF είναι κατά λίγο πιο ακριβή στις προβλέψεις από αυτά με BoW.<br>
Τα classifications με Word2Vec έχουν με διαφορά τη χαμηλότερη ακρίβεια από τις τρεις μεθόδους. Αυτό συμβαίνει γιατί χρησιμοποιήθηκαν embeddings που έγιναν trained ξεχωριστά και μόνο με τα κείμενα στο train_data και στο test_data αντίστοιχα.
Αν χρησιμοποιηθούν pre-trained embeddings, η ακρίβεια εκτοξεύεται και έχει τη δυνατότητα να ξεπεράσει τις άλλες δύο μεθόδους αναπαράστασης (ενδεικτικά, με τη χρήση glove-twitter-25 η ακρίβεια ανέβηκε στο ~86%).


# Optimization of classification:

TF-IDF has the best overall classification accuracy.<br>
We will try to improve the classifications further by applying additional preprocessing to the data.<br>
Preprocessing will consist of:
*   Lemmatizing words
*   Removing all special characters
*   Removing single characters


In [None]:
def preprocessing(text):
    """Replace every non-word character in *text* with a space, then remove stand-alone single ASCII letters."""
    spaced = re.sub(r'\W', ' ', text)
    return re.sub(r'\b[a-zA-Z]\b', '', spaced)

w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()


def lemmatize_text(text):
    """Lemmatize every whitespace-separated token of *text* and re-join the results with single spaces."""
    lemmas = map(lemmatizer.lemmatize, w_tokenizer.tokenize(text))
    return ' '.join(lemmas)

# Load the train/test files, discarding rows with missing values. The former
# [:500] / [:1000] caps were removed so this run uses the same 30% sample as
# the earlier TF-IDF run and the two sets of results are comparable.
df_trn = pd.read_csv(train_data).dropna(axis='rows')
df_tst = pd.read_csv(test_data).dropna(axis='rows')

# Ratio of lines to be read from each file
# Read a portion to avoid running out of RAM (files are too big)
ratio = 3/10
# .copy() so the column assignments below modify independent frames rather
# than slices of the frames read above.
df_trn = df_trn[:round(len(df_trn.index) * ratio)].copy()
df_tst = df_tst[:round(len(df_tst.index) * ratio)].copy()

# Extra clean-up followed by lemmatization. The former
# df['text'] = df['text'].dropna() lines were dropped: assigning the result
# back re-aligns it on the index, so they removed nothing.
df_trn['text'] = df_trn['text'].apply(preprocessing).apply(lemmatize_text)
df_tst['text'] = df_tst['text'].apply(preprocessing).apply(lemmatize_text)

In [None]:
# Learn the unigram vocabulary and IDF weights on the training texts only,
# then encode both sets as dense TF-IDF matrices.
tfidf_v = TfidfVectorizer(ngram_range=(1, 1), stop_words='english', max_df=1.0, min_df=1)
features_trn = tfidf_v.fit_transform(df_trn['text']).toarray()

tfidf_x = tfidf_v.transform(df_tst['text'])
features_tst = tfidf_x.toarray()

# Feature matrices and labels consumed by the classifier cells
traindata, trainlabel = np.asarray(features_trn), np.asarray(df_trn['label'])
testdata, testlabel = np.asarray(features_tst), np.asarray(df_tst['label'])

### Classifications:

In [None]:
# Logistic Regression: fit on the training set, then report accuracy and
# weighted F1 on the test set.
model = LogisticRegression(C=20).fit(traindata, trainlabel)
accuracy = model.score(testdata, testlabel)
lr_pred = model.predict(testdata)
f1_scr = f1_score(testlabel, lr_pred, average='weighted')

print("Logistic Regression")
print("accuracy = ", accuracy * 100, "%")
print("f1 score = ", f1_scr * 100, "%")

In [None]:
# Naive Bayes: fit on the training set, then report accuracy and weighted F1
# on the test set.
model = GaussianNB().fit(traindata, trainlabel)
accuracy = model.score(testdata, testlabel)
lr_pred = model.predict(testdata)
f1_scr = f1_score(testlabel, lr_pred, average='weighted')

print("Naive Bayes")
print("accuracy = ", accuracy * 100, "%")
print("f1 score = ", f1_scr * 100, "%")

In [None]:
# Support Vector Machines
# Using LinearSVC for better speed and accuracy
model = LinearSVC(C=100).fit(traindata, trainlabel)
accuracy = model.score(testdata, testlabel)
lr_pred = model.predict(testdata)
f1_scr = f1_score(testlabel, lr_pred, average='weighted')

print("Support Vector Machines")
print("accuracy = ", accuracy * 100, "%")
print("f1 score = ", f1_scr * 100, "%")

In [None]:
# Random Forest (200 trees): fit on the training set, then report accuracy
# and weighted F1 on the test set.
model = RandomForestClassifier(n_estimators=200).fit(traindata, trainlabel)
accuracy = model.score(testdata, testlabel)
lr_pred = model.predict(testdata)
f1_scr = f1_score(testlabel, lr_pred, average='weighted')

print("Random Forests")
print("accuracy = ", accuracy * 100, "%")
print("f1 score = ", f1_scr * 100, "%")

### Conclusions

Η ακρίβεια άλλαξε ελάχιστα.
Γενικά έχει γίνει ήδη πολλή προεπεξεργασία στα δεδομένα, οπότε δεν υπάρχουν ιδιαίτερα περιθώρια βελτίωσης με περαιτέρω προεπεξεργασία.