In [1]:
import sys

# Make the project's helper scripts importable (needed by the local-script imports below).
# The path is relative to the kernel's working directory.
# Guarded so that re-running this cell does not stack duplicate entries on sys.path.
if '../../scripts/' not in sys.path:
    sys.path.insert(0, '../../scripts/')

In [2]:
# import required packages
import pandas as pd

# preprocess
from sklearn.feature_extraction.text import TfidfVectorizer

# local scripts
from text_utils import preprocess_corpus

In [3]:
# read the training data into a dataframe (path is relative to the working directory)
df = pd.read_csv(filepath_or_buffer='../../data/train_data.csv')

# size of the loaded frame as (n_rows, n_columns)
display(df.shape)

# preview the first 5 rows
df.head(n=5)

(28614, 2)

Unnamed: 0,clean_tweet,cyberbullying_type
0,hate ppl high school used bully hot omg love m...,age
1,kat andre asshole omg mkr,not_cyberbullying
2,new access trading cause need high level opini...,age
3,fuck david duke racist think america belong du...,ethnicity
4,may say lot hate apologetic army hope choke ev...,other_cyberbullying


In [4]:
# prepare the documents for vectorization
# missing tweets are replaced with empty strings so no row is null
clean_tweets = df['clean_tweet'].fillna('')

# NOTE(review): no further cleaning happens in this cell; the imported
# `preprocess_corpus` helper is not called. The column name suggests the text
# is already cleaned upstream -- confirm against whatever produced train_data.csv.

In [5]:
# TF-IDF vectorizer; min_df=10 ignores terms found in fewer than 10 documents
vectorizer = TfidfVectorizer(min_df=10)

# learn vocabulary and IDF weights from the tweets and encode them in one pass
vectors = vectorizer.fit_transform(raw_documents=clean_tweets)

In [6]:
# densify the TF-IDF matrix into a dataframe with one column per vocabulary term
df_tfidf = pd.DataFrame(
    data=vectors.toarray(),
    columns=vectorizer.get_feature_names_out(),
)

# attach the target label next to the TF-IDF features
df_tfidf['cyberbullying_type'] = df['cyberbullying_type']

# (n_rows, n_feature_columns + 1 label column)
display(df_tfidf.shape)

# preview the first 5 rows
df_tfidf.head(n=5)

(28614, 3844)

Unnamed: 0,aalwuhaib,ability,able,abortion,absolute,absolutely,abt,abu,abuse,abused,...,yousufpoosuf,youth,youtube,ypg,yup,zaibatsunews,zappe,zero,zionist,cyberbullying_type
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,age
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,not_cyberbullying
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,age
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ethnicity
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,other_cyberbullying
