In [1]:
!pip install pandasql
!pip install imbalanced-learn



In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pandasql import sqldf
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score

# Data Loading and initial cleaning

In [3]:
df = pd.read_csv('r_dataisbeautiful_posts.csv')
df.dropna(subset=['title'], inplace=True)
df.drop(['id', 'author_flair_text', 'awarders', 'created_utc', 'full_link'], axis=1, inplace=True)
df['total_awards_received'].fillna(0, inplace=True)
df['removed_by'].fillna('Not Deleted', inplace=True)
df['is_removed'] = df['removed_by'].apply(lambda x: 1 if x != 'Not Deleted' else 0)
df.drop(['removed_by', 'over_18'], axis=1, inplace=True)


  df = pd.read_csv('r_dataisbeautiful_posts.csv')
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['total_awards_received'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['removed_by'].fillna('Not Deleted', inplace=True)


In [4]:
df.shape

(193090, 6)

In [5]:
df.describe()

Unnamed: 0,score,total_awards_received,num_comments,is_removed
count,193090.0,193090.0,193090.0,193090.0
mean,180.26209,0.001626,26.374639,0.081232
std,1946.312184,0.126901,206.887457,0.273191
min,0.0,0.0,0.0,0.0
25%,1.0,0.0,1.0,0.0
50%,1.0,0.0,1.0,0.0
75%,4.0,0.0,4.0,0.0
max,116226.0,30.0,18801.0,1.0


In [6]:
df.isna().sum()

title                    0
score                    0
author                   0
total_awards_received    0
num_comments             0
is_removed               0
dtype: int64

In [8]:
df.head()

Unnamed: 0,title,score,author,total_awards_received,num_comments,is_removed
0,[OC] The World’s Forests Mapped,1,vividmaps,0.0,1,0
1,[OC] Airbnb presence mapped in Barcelona (BCN)...,1,JonnieNeptune,0.0,0,0
2,A marketplace for open streaming data sources,1,DangerMouse289,0.0,0,1
3,[OC] Black Owner-Occupied Housing,1,SocialExplorerInc,0.0,1,0
4,[OC] My monthly average steps in 2020. Quarant...,1,ahmedgelemli,0.0,0,0


# Text Preprocessing

In [9]:
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text, re.I|re.A)
    text = text.lower()
    tokens = word_tokenize(text)
    stemmed_tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]
    return " ".join(stemmed_tokens)

df['cleaned_title'] = df['title'].apply(preprocess_text)


  text = re.sub(r'[^a-zA-Z\s]', '', text, re.I|re.A)


# Feature Engineering

In [10]:
print("--- Creating Improved Features ---")
# a) TF-IDF for text features
vectorizer = TfidfVectorizer(max_features=5000)
X_text = vectorizer.fit_transform(df['cleaned_title'])

# b) Scale the numeric features
numeric_features = df[['score', 'num_comments', 'total_awards_received']]
scaler = StandardScaler()
X_numeric = scaler.fit_transform(numeric_features)

# c) Combine text and numeric features
# hstack is used to combine sparse (TF-IDF) and dense (numeric) matrices
X_combined = hstack([X_text, X_numeric])
y = df['is_removed']

print("Shape of combined feature matrix:", X_combined.shape)

--- Creating Improved Features ---
Shape of combined feature matrix: (193090, 5003)


# Model Training

In [11]:
print("\n--- Model Training with SMOTE ---")

# a) Train-Test Split (BEFORE applying SMOTE)
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42, stratify=y)

# b) Apply SMOTE to the training data only
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print(f"Shape of training data before SMOTE: {X_train.shape}")
print(f"Shape of training data after SMOTE: {X_train_resampled.shape}")
print("\nTraining set class distribution after SMOTE:\n", y_train_resampled.value_counts())

# c) Train Random Forest Model on the resampled data
# We remove class_weight='balanced' now because SMOTE has already balanced the data
rf_classifier = RandomForestClassifier(random_state=42, n_estimators=100)
rf_classifier.fit(X_train_resampled, y_train_resampled)
y_pred_rf = rf_classifier.predict(X_test)

# d) Evaluate the improved model
print("\n--- Improved Random Forest Results ---")
print(classification_report(y_test, y_pred_rf))
print(f"Random Forest F1-Score: {f1_score(y_test, y_pred_rf):.2f}")


--- Model Training with SMOTE ---
Shape of training data before SMOTE: (154472, 5003)
Shape of training data after SMOTE: (283848, 5003)

Training set class distribution after SMOTE:
 is_removed
0    141924
1    141924
Name: count, dtype: int64

--- Improved Random Forest Results ---
              precision    recall  f1-score   support

           0       0.95      0.95      0.95     35481
           1       0.45      0.47      0.46      3137

    accuracy                           0.91     38618
   macro avg       0.70      0.71      0.71     38618
weighted avg       0.91      0.91      0.91     38618

Random Forest F1-Score: 0.46
