In [3]:
import polars as pl 
import polars_ds as pds
import spacy 
from spacy.lang.en.stop_words import STOP_WORDS
# To get the english dictionary
import nltk
from nltk.corpus import words,stopwords
import os 
import string
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import TfidfVectorizer 

import warnings
warnings.filterwarnings('ignore')

In [20]:
try:
    nlp = spacy.load("en_core_web_sm")
except:
    print("Downloading spaCy NLP model...")
    print("This may take a few minutes and it's one time process...")
    os.system(
        "pip install https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl")
    nlp = spacy.load("en_core_web_sm")

In [4]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Yafee
[nltk_data]     Ishraq\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Read the data 

In [3]:
data = pl.read_csv('train.csv')

# Null values 

There are very few null values in keyword column. Location contains 2533 null values. Later I will be looking at a word cloud to see if there are any patterns in the text columns.

In [4]:
data.null_count()

id,keyword,location,text,target
u32,u32,u32,u32,u32
0,61,2533,0,0


# Text cleaning

## Simple cleaning operations

In [5]:
# Remove leading and trailing characters
data = data.with_columns(pl.col(pl.String).str.strip_chars())

# To lowercase
data = data.with_columns(pl.col(pl.String).str.to_lowercase())

# Remove stopwords. Stopwords from the nltk library
stop_words = stopwords.words('english')

# Making expressions for all string columns. The pl.all() method is not returning the expected outp
stop_word_expression = [pl.col(c).str.split(" ").list.set_difference(stop_words).list.join(" ").name.keep() for c in data.select(pl.col(pl.String)).columns]

# Apply expressions
data = data.with_columns(*stop_word_expression)


### Defining regex

In [6]:
import re

# Regex for removing http url 
regex_http_url = r'http[s]?://\S+'

# Regex for punctuations
regex_pattern_punctuations = '[' + re.escape(string.punctuation) + ']'

# Regex for special characters
regex_pattern_special_characters = r'^[^A-Za-z0-9]*$'

# Regex for numbers
regex_pattern_numbers = r'\d'

# Remove ascii
regex_pattern_ascii = r'[^\p{Ascii}]'

# Regex for 2 or more whitespace
regex_whitespace = r'\s{2,}'


### Run text cleaning

In [7]:

data = data.with_columns(pl.col('location')
                  .str.replace_all(regex_http_url," ")
                  .str.replace_all(regex_pattern_punctuations," ")
                  .str.replace_all(regex_pattern_special_characters," ")
                  .str.replace_all(regex_pattern_numbers," ")
                  .str.replace_all(regex_pattern_ascii," ")
                  .str.replace_all(regex_whitespace," ")
                  .str.strip_chars()
                  .alias('cleaned_location'),

                  pl.col('text')
                  .str.replace_all(regex_http_url," ") 
                  .str.replace_all(regex_pattern_punctuations," ")
                  .str.replace_all(regex_pattern_special_characters," ")
                  .str.replace_all(regex_pattern_numbers," ")
                  .str.replace_all(regex_pattern_ascii," ")
                  .str.replace_all(regex_whitespace," ")
                  .str.strip_chars()
                  .alias('cleaned_text'),

                  pl.col('keyword')
                  .str.replace_all(regex_http_url," ")
                  .str.replace_all(regex_pattern_punctuations," ")
                  .str.replace_all(regex_pattern_special_characters," ")
                  .str.replace_all(regex_pattern_numbers," ")
                  .str.replace_all(regex_pattern_ascii," ")
                  .str.replace_all(regex_whitespace," ")
                  .str.strip_chars()
                  .alias('cleaned_keyword')
)

In [17]:
# Initialize
tfidf_vectorizer = TfidfVectorizer()
tfidf_vector = tfidf_vectorizer.fit_transform(data['cleaned_text'])

In [41]:
# To polars
tf_idf_df = pl.from_numpy(tfidf_vector.toarray(),schema=list(tfidf_vectorizer.get_feature_names_out()))

# Add target variable 
tf_idf_df = tf_idf_df.hstack(data.select(pl.col('target').alias('target_ml')))


In [42]:
X = tf_idf_df.select(pl.all().exclude('target_ml'))
y = tf_idf_df.select('target_ml')

In [45]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [48]:
rf = RandomForestClassifier(n_estimators = 100)  
rf.fit(X_train,y_train)
y_pred = rf.predict(X_test)

  return fit_method(estimator, *args, **kwargs)


In [52]:
from sklearn import metrics 
print("ACCURACY OF THE MODEL:", metrics.accuracy_score(y_test, y_pred))

ACCURACY OF THE MODEL: 0.7655942219304005


In [57]:
xgb = XGBClassifier()
xgb.fit(X_train,y_train)
y_pred = xgb.predict(X_test)

In [58]:
print("ACCURACY OF THE MODEL:", metrics.accuracy_score(y_test, y_pred))

ACCURACY OF THE MODEL: 0.7701904136572554
