In [None]:
import pandas as pd
import numpy as np

import re 
import random
from imblearn.over_sampling import RandomOverSampler
from scipy.sparse import hstack

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import joblib
import json


In [None]:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import warnings 
warnings.filterwarnings("ignore")

In [3]:
df = pd.read_csv('data.csv')

In [4]:
df.dropna(inplace = True)

In [5]:
status_counts = df['status'].value_counts()

In [6]:
random_statements = df.groupby('status')['statement'].apply(lambda x: x.sample(n=1).iloc[0])

In [7]:
df['num_of_characters'] = df['statement'].str.len()
df['num_of_sentences'] = df['statement'].apply(lambda x: len(nltk.sent_tokenize(x)))

description = df[['num_of_characters', 'num_of_sentences']].describe()

LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - 'C:\\Users\\isakh/nltk_data'
    - 'c:\\Users\\isakh\\AppData\\Local\\Programs\\Python\\Python312\\nltk_data'
    - 'c:\\Users\\isakh\\AppData\\Local\\Programs\\Python\\Python312\\share\\nltk_data'
    - 'c:\\Users\\isakh\\AppData\\Local\\Programs\\Python\\Python312\\lib\\nltk_data'
    - 'C:\\Users\\isakh\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


In [None]:
df.rename(columns={'statement': 'original_statement'}, inplace=True)

In [None]:
df['statement']=df['original_statement'].str.lower()

In [None]:
def remove_patterns(text):

    '''
    This function removes all URLs, markdown-style links,
    handels, and punctuation/other special characters
    '''
    
    text = re.sub(r'http[s]?://\S+', '', text)

    text = re.sub(r'\[.*?\]\(.*?\)', '', text)

    text = re.sub(r'@\w+', '', text)

    text = re.sub(r'[^\w\s]', '', text)
    
    return text.strip()

df['statement'] = df['statement'].apply(remove_patterns)

In [None]:
df['tokens'] = df['statement'].apply(word_tokenize)

In [None]:
stemmer = PorterStemmer()

def stem_tokens(tokens):
    return ' '.join(stemmer.stem(str(token)) for token in tokens)

df['tokens_stemmed'] = df['tokens'].apply(stem_tokens)

In [None]:
X = df[['tokens_stemmed', 'num_of_characters', 'num_of_sentences']]
y = df['status']


In [None]:
lbl_enc = LabelEncoder()
#my = lbl_enc.fit_transform(y.values)

encoder = LabelEncoder()
encoded_values = encoder.fit_transform(y)

# Check what labels are encoded by
print("Unique classes:", encoder.classes_)  # Displays unique labels
print("Encoded values:", encoded_values)   # Displays the transformed values
print("Mapping:", dict(zip(encoder.classes_, range(len(encoder.classes_)))))  # Shows mapping


In [None]:
df.to_csv("cleaned_df.csv", index= False)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

In [None]:
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=50000)
X_train_tfidf = vectorizer.fit_transform(X_train['tokens_stemmed'])
X_test_tfidf = vectorizer.transform(X_test['tokens_stemmed'])

X_train_num = X_train[['num_of_characters', 'num_of_sentences']].values
X_test_num = X_test[['num_of_characters', 'num_of_sentences']].values

X_train_combined = hstack([X_train_tfidf, X_train_num])
X_test_combined = hstack([X_test_tfidf, X_test_num])

print('Number of feature words: ', len(vectorizer.get_feature_names_out()))

Number of feature words:  50002


In [None]:
X_train_combined.shape

(42144, 50004)

In [None]:
ros = RandomOverSampler(random_state=101)
X_train_resampled, y_train_resampled = ros.fit_resample(X_train_combined, y_train)

In [None]:
model = XGBClassifier(learning_rate=0.2, max_depth=7, n_estimators=500, random_state=101, tree_method='gpu_hist', device='cuda')
model.fit(X_train_resampled, y_train_resampled)
prediction = model.predict(X_test_combined)
accuracy = accuracy_score(y_test, prediction)

In [None]:
print(accuracy)

0.8062067001992977


In [None]:
joblib.dump(model, "xgb_model.pkl")

['xgb_model.json']