In [17]:
import pandas as pd
import numpy as np
import re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score

import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to /Users/sven/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/sven/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /Users/sven/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/sven/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /Users/sven/nltk_data...


In [2]:
print(nltk.data.path)

['/Users/sven/nltk_data', '/opt/anaconda3/nltk_data', '/opt/anaconda3/share/nltk_data', '/opt/anaconda3/lib/nltk_data', '/usr/share/nltk_data', '/usr/local/share/nltk_data', '/usr/lib/nltk_data', '/usr/local/lib/nltk_data']


In [4]:
# Load the data
data_file = "training_data_lowercase.csv"
data = pd.read_csv(data_file, sep="\t", header=None)  # Load as tab-delimited file without headers

In [5]:
# Assign new column names
data.columns = ['label', 'text']

data.head(5)

Unnamed: 0,label,text
0,0,donald trump sends out embarrassing new year‚s...
1,0,drunk bragging trump staffer started russian c...
2,0,sheriff david clarke becomes an internet joke ...
3,0,trump is so obsessed he even has obama‚s name ...
4,0,pope francis just called out donald trump duri...


In [6]:
# Remove special characters
data['text'] = data['text'].str.replace(r'[^a-z0-9\s]', '', regex=True)

In [7]:
stop_words = set(stopwords.words('english'))

In [12]:
# Tokenize and remove stopwords for each text entry in the dataframe
data['filtered_text'] = data['text'].apply(lambda x: [word for word in word_tokenize(x) if word.lower() not in stop_words])

In [18]:
# Lemmatization of text to leverage the context as well

# from nltk.stem import WordNetLemmatizer
# from nltk.corpus import wordnet

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

# Function to get part-of-speech (POS) tagging for more accurate lemmatization
def get_wordnet_pos(word):
    from nltk.corpus import wordnet
    from nltk import pos_tag
    tag = pos_tag([word])[0][1][0].upper()
    tag_dict = {
        'J': wordnet.ADJ,  # Adjective
        'N': wordnet.NOUN,  # Noun
        'V': wordnet.VERB,  # Verb
        'R': wordnet.ADV   # Adverb
    }
    return tag_dict.get(tag, wordnet.NOUN)  # Default to noun

# Apply lemmatization to the filtered_text column
data['lemmatized_text'] = data['filtered_text'].apply(
    lambda tokens: [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in tokens]
)

# Display the first few rows to verify the result
print(data.head())



   label                                               text  \
0      0  donald trump sends out embarrassing new years ...   
1      0  drunk bragging trump staffer started russian c...   
2      0  sheriff david clarke becomes an internet joke ...   
3      0  trump is so obsessed he even has obamas name c...   
4      0  pope francis just called out donald trump duri...   

                                       filtered_text  \
0  [donald, trump, sends, embarrassing, new, year...   
1  [drunk, bragging, trump, staffer, started, rus...   
2  [sheriff, david, clarke, becomes, internet, jo...   
3  [trump, obsessed, even, obamas, name, coded, w...   
4  [pope, francis, called, donald, trump, christm...   

                                     lemmatized_text  
0  [donald, trump, sends, embarrass, new, year, e...  
1  [drunk, bragging, trump, staffer, start, russi...  
2  [sheriff, david, clarke, becomes, internet, jo...  
3  [trump, obsess, even, obamas, name, cod, websi...  
4  [pope,

In [24]:
# TF IDF

# Combine tokens into single text strings for each row in 'lemmatized_text'
data['lemmatized_text'] = data['lemmatized_text'].apply(lambda tokens: ' '.join(tokens))

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(
    max_features=1000,  # Limit to top 5000 words 
    stop_words='english',  # Exclude common stopwords 
    ngram_range=(1, 2)  # Consider unigrams and bigrams 
)

# Fit and transform the lemmatized text column
tfidf_matrix = tfidf_vectorizer.fit_transform(data['lemmatized_text'])

# Convert the resulting sparse matrix to a DataFrame for analysis
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())


ValueError: empty vocabulary; perhaps the documents only contain stop words

In [23]:
# Display the first few rows of the TF-IDF DataFrame
print(tfidf_df.head(50))

     10  10 year  100  100 day  1000   11   12   13   14   15  ...  zika  \
0   0.0      0.0  0.0      0.0   0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   
1   0.0      0.0  0.0      0.0   0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   
2   0.0      0.0  0.0      0.0   0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   
3   0.0      0.0  0.0      0.0   0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   
4   0.0      0.0  0.0      0.0   0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   
5   0.0      0.0  0.0      0.0   0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   
6   0.0      0.0  0.0      0.0   0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   
7   0.0      0.0  0.0      0.0   0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   
8   0.0      0.0  0.0      0.0   0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   
9   0.0      0.0  0.0      0.0   0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   
10  0.0      0.0  0.0      0.0   0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   
11  0.0      0.0  0.0      0.0   0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   
12  0.0     

In [25]:
print(data['lemmatized_text'].head(10))
print(data['lemmatized_text'].isna().sum())  # Check for NaN values
print(data['lemmatized_text'].apply(len).describe())  # Analyze lengths of text

0    d   o   n   a   l   d       t   r   u   m   p ...
1    d   r   u   n   k       b   r   a   g   g   i ...
2    s   h   e   r   i   f   f       d   a   v   i ...
3    t   r   u   m   p       o   b   s   e   s   s ...
4    p   o   p   e       f   r   a   n   c   i   s ...
5    r   a   c   i   s   t       a   l   a   b   a ...
6    f   r   e   s   h       g   o   l   f       c ...
7    t   r   u   m   p       s   a   y       i   n ...
8    f   o   r   m   e   r       c   i   a       d ...
9    b   r   a   n   d   n   e   w       p   r   o ...
Name: lemmatized_text, dtype: object
0
count    34152.000000
mean       233.234950
std         70.746604
min          0.000000
25%        193.000000
50%        225.000000
75%        265.000000
max        913.000000
Name: lemmatized_text, dtype: float64
