In [1]:
import pandas as pd
import numpy as np
import re


from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score

# NLTK
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mehak\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mehak\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\mehak\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mehak\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
print(nltk.data.path)

['C:\\Users\\mehak/nltk_data', 'c:\\ProgramData\\anaconda3\\nltk_data', 'c:\\ProgramData\\anaconda3\\share\\nltk_data', 'c:\\ProgramData\\anaconda3\\lib\\nltk_data', 'C:\\Users\\mehak\\AppData\\Roaming\\nltk_data', 'C:\\nltk_data', 'D:\\nltk_data', 'E:\\nltk_data']


In [3]:
# Load the dataset (update file path accordingly)
data_file = "training_data_lowercase.csv"
data = pd.read_csv(data_file, sep="\t", header=None)  # Tab-separated

In [4]:
data.columns = ['label', 'text'] 

data.head(5)

Unnamed: 0,label,text
0,0,donald trump sends out embarrassing new year‚s...
1,0,drunk bragging trump staffer started russian c...
2,0,sheriff david clarke becomes an internet joke ...
3,0,trump is so obsessed he even has obama‚s name ...
4,0,pope francis just called out donald trump duri...


In [5]:
data['text'] = data['text'].str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)

In [6]:
stop_words = set(stopwords.words('english'))

In [7]:
# Tokenize and remove stopwords
data['filtered_text'] = data['text'].apply(lambda x: [word for word in word_tokenize(x.lower()) if word not in stop_words])

In [8]:
# Lemmatization of text to leverage the context as well

# from nltk.stem import WordNetLemmatizer
# from nltk.corpus import wordnet

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

# Function to get part-of-speech (POS) tagging for more accurate lemmatization
def get_wordnet_pos(word):
    from nltk.corpus import wordnet
    from nltk import pos_tag
    tag = pos_tag([word])[0][1][0].upper()
    tag_dict = {
        'J': wordnet.ADJ,  # Adjective
        'N': wordnet.NOUN,  # Noun
        'V': wordnet.VERB,  # Verb
        'R': wordnet.ADV   # Adverb
    }
    return tag_dict.get(tag, wordnet.NOUN)  # Default to noun

# Apply lemmatization to the filtered_text column
data['lemmatized_text'] = data['filtered_text'].apply(
    lambda tokens: [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in tokens]
)

# Display the first few rows to verify the result
print(data.head())


   label                                               text  \
0      0  donald trump sends out embarrassing new years ...   
1      0  drunk bragging trump staffer started russian c...   
2      0  sheriff david clarke becomes an internet joke ...   
3      0  trump is so obsessed he even has obamas name c...   
4      0  pope francis just called out donald trump duri...   

                                       filtered_text  \
0  [donald, trump, sends, embarrassing, new, year...   
1  [drunk, bragging, trump, staffer, started, rus...   
2  [sheriff, david, clarke, becomes, internet, jo...   
3  [trump, obsessed, even, obamas, name, coded, w...   
4  [pope, francis, called, donald, trump, christm...   

                                     lemmatized_text  
0  [donald, trump, sends, embarrass, new, year, e...  
1  [drunk, bragging, trump, staffer, start, russi...  
2  [sheriff, david, clarke, becomes, internet, jo...  
3  [trump, obsess, even, obamas, name, cod, websi...  
4  [pope,

In [9]:
# Combine tokens into single text strings for each row in 'lemmatized_text'
data['lemmatized_text'] = data['lemmatized_text'].apply(lambda tokens: ' '.join(tokens))

In [10]:
# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(
    max_features=100,  # Adjust as needed
    stop_words='english',  # Ignore common stopwords
    ngram_range=(1, 2)  # Unigrams and bigrams
)

print(data['lemmatized_text'])

0        donald trump sends embarrass new year eve mess...
1        drunk bragging trump staffer start russian col...
2        sheriff david clarke becomes internet joke thr...
3          trump obsess even obamas name cod website image
4          pope francis call donald trump christmas speech
                               ...                        
34147              tear rain thai gather late king funeral
34148    pyongyang university need nonus teacher travel...
34149    philippine president duterte visit japan ahead...
34150             japan abe may election many dont want pm
34151      demoralize divide inside catalonia police force
Name: lemmatized_text, Length: 34152, dtype: object


In [11]:
# Fit and transform the lemmatized text column
tfidf_matrix = tfidf_vectorizer.fit_transform(data['lemmatized_text'])

print(data['lemmatized_text'])

0        donald trump sends embarrass new year eve mess...
1        drunk bragging trump staffer start russian col...
2        sheriff david clarke becomes internet joke thr...
3          trump obsess even obamas name cod website image
4          pope francis call donald trump christmas speech
                               ...                        
34147              tear rain thai gather late king funeral
34148    pyongyang university need nonus teacher travel...
34149    philippine president duterte visit japan ahead...
34150             japan abe may election many dont want pm
34151      demoralize divide inside catalonia police force
Name: lemmatized_text, Length: 34152, dtype: object


In [13]:
# Convert the resulting sparse matrix to a DataFrame for analysis
tfidf_df = pd.DataFrame(
    tfidf_matrix.toarray(), 
    columns=tfidf_vectorizer.get_feature_names_out() )

In [14]:
# Display the first few rows of the TF-IDF DataFrame
print(tfidf_df.head(50))

    america  american    attack  ban     black    break  campaign  chief  \
0       0.0       0.0  0.000000  0.0  0.000000  0.00000  0.000000    0.0   
1       0.0       0.0  0.000000  0.0  0.000000  0.00000  0.000000    0.0   
2       0.0       0.0  0.000000  0.0  0.000000  0.00000  0.000000    0.0   
3       0.0       0.0  0.000000  0.0  0.000000  0.00000  0.000000    0.0   
4       0.0       0.0  0.000000  0.0  0.000000  0.00000  0.000000    0.0   
5       0.0       0.0  0.000000  0.0  0.677261  0.00000  0.000000    0.0   
6       0.0       0.0  0.000000  0.0  0.000000  0.00000  0.000000    0.0   
7       0.0       0.0  0.000000  0.0  0.000000  0.00000  0.000000    0.0   
8       0.0       0.0  0.000000  0.0  0.000000  0.00000  0.000000    0.0   
9       0.0       0.0  0.000000  0.0  0.000000  0.00000  0.000000    0.0   
10      0.0       0.0  0.000000  0.0  0.000000  0.00000  0.000000    0.0   
11      0.0       0.0  0.000000  0.0  0.000000  0.00000  0.000000    0.0   
12      0.0 

In [15]:
print(data['lemmatized_text'].head(10))
print(data['lemmatized_text'].isna().sum())  # Check for NaN values
print(data['lemmatized_text'].apply(len).describe())  # Analyze lengths of text

0    donald trump sends embarrass new year eve mess...
1    drunk bragging trump staffer start russian col...
2    sheriff david clarke becomes internet joke thr...
3      trump obsess even obamas name cod website image
4      pope francis call donald trump christmas speech
5    racist alabama cop brutalize black boy handcuf...
6                                    fresh golf course
7    trump say insanely racist stuff inside oval of...
8           former cia director slam trump un bullying
9     brandnew protrump ad feature much kiss make sick
Name: lemmatized_text, dtype: object
0
count    34152.000000
mean        59.058562
std         17.687234
min          0.000000
25%         49.000000
50%         57.000000
75%         67.000000
max        229.000000
Name: lemmatized_text, dtype: float64


In [16]:
X = tfidf_df  # Feature set
y = data['label']  # Labels

# Split data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
# Train Logistic Regression
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)

# Predict and evaluate
y_pred_lr = log_reg.predict(X_test)
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))
print("Classification Report:\n", classification_report(y_test, y_pred_lr))

Logistic Regression Accuracy: 0.8003220611916264
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.77      0.80      3529
           1       0.77      0.83      0.80      3302

    accuracy                           0.80      6831
   macro avg       0.80      0.80      0.80      6831
weighted avg       0.80      0.80      0.80      6831



In [18]:
# Train Decision Tree Classifier
tree_clf = DecisionTreeClassifier()
tree_clf.fit(X_train, y_train)

# Predict and evaluate
y_pred_tree = tree_clf.predict(X_test)
print("Decision Tree Accuracy:", accuracy_score(y_test, y_pred_tree))
print("Classification Report:\n", classification_report(y_test, y_pred_tree))

Decision Tree Accuracy: 0.8028107158541942
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.74      0.80      3529
           1       0.76      0.87      0.81      3302

    accuracy                           0.80      6831
   macro avg       0.81      0.80      0.80      6831
weighted avg       0.81      0.80      0.80      6831



In [19]:
print("Number of NaN values:", data['lemmatized_text'].isna().sum())

Number of NaN values: 0


In [20]:
print(data['lemmatized_text'].apply(len).describe())

count    34152.000000
mean        59.058562
std         17.687234
min          0.000000
25%         49.000000
50%         57.000000
75%         67.000000
max        229.000000
Name: lemmatized_text, dtype: float64


In [21]:
print("TF-IDF Features:\n", tfidf_df.head())

TF-IDF Features:
    america  american  attack  ban  black  break  campaign  chief  china  \
0      0.0       0.0     0.0  0.0    0.0    0.0       0.0    0.0    0.0   
1      0.0       0.0     0.0  0.0    0.0    0.0       0.0    0.0    0.0   
2      0.0       0.0     0.0  0.0    0.0    0.0       0.0    0.0    0.0   
3      0.0       0.0     0.0  0.0    0.0    0.0       0.0    0.0    0.0   
4      0.0       0.0     0.0  0.0    0.0    0.0       0.0    0.0    0.0   

   clinton  ...  video  vote  voter  want  watch  white  white house  win  \
0      0.0  ...    0.0   0.0    0.0   0.0    0.0    0.0          0.0  0.0   
1      0.0  ...    0.0   0.0    0.0   0.0    0.0    0.0          0.0  0.0   
2      0.0  ...    0.0   0.0    0.0   0.0    0.0    0.0          0.0  0.0   
3      0.0  ...    0.0   0.0    0.0   0.0    0.0    0.0          0.0  0.0   
4      0.0  ...    0.0   0.0    0.0   0.0    0.0    0.0          0.0  0.0   

   woman      year  
0    0.0  0.553761  
1    0.0  0.000000  
2    