# Loading and Cleaning Data From CSV 

In [3]:
import pandas as pd

df = pd.read_csv('labeled_subreddit_posts.csv')
df = df.dropna()

print(df.head(20))


       subreddit                                              title  \
0      AskReddit                2024 united states elections thread   
1   ChangeMyView  meta: research collaboration opportunity with ...   
2   ChangeMyView  cmv: goodhearted "cultural appropriation" is f...   
3   ChangeMyView  cmv: anyone given a life sentence should also ...   
4   ChangeMyView  cmv: most people are too lazy to actually seek...   
5   ChangeMyView  cmv: it isn't hyperbolic for the west to be wo...   
6   ChangeMyView  cmv: the american healthcare system is not onl...   
7   ChangeMyView  cmv: the best way to tax corporations would be...   
8   ChangeMyView  cmv: the period of time when women were joking...   
9   ChangeMyView  cmv: the american led world system is not good...   
10  ChangeMyView       cmv:  the promise to gorbachev has no value.   
11  ChangeMyView  cmv: religion just a cycle of generational bra...   
12  ChangeMyView  cmv: you should be able to shoot to kill anyon...   
13  Ch

# Feature Engineering

In [None]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

le = LabelEncoder()
df['subreddit_label'] = le.fit_transform(df['subreddit_name'])

scaler = MinMaxScaler()
normalized_columns = ['upvote_ratio', 'comments_count', 'subscriber_count', 'engagement']
df[normalized_columns] = scaler.fit_transform(df[normalized_columns])

df['hour'] = pd.to_datetime(df['timestamp']).dt.hour
df['day_of_week'] = pd.to_datetime(df['timestamp']).dt.dayofweek

print(df[['subreddit_label', 'upvote_ratio', 'comments_count', 'subscriber_count', 'hour', 'day_of_week']].head())


   subreddit_label  upvote_ratio  comments_count  subscriber_count  hour  \
0                2      0.684211        0.443322          1.000000    17   
1                5      0.694737        0.000538          0.054069    10   
2                5      0.842105        0.004125          0.054069    19   
3                5      0.831579        0.002272          0.054069    19   
4                5      0.831579        0.003647          0.054069    12   

   day_of_week  
0            1  
1            2  
2            3  
3            3  
4            3  


# Processing Text Data Using TF-IDF


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Combine 'title' and 'content' for text analysis
df['combined_text'] = df['title'] + " " + df['content']

# Convert text to TF-IDF features
tfidf = TfidfVectorizer(max_features=500)  # Limit to 500 features for efficiency
text_features = tfidf.fit_transform(df['combined_text']).toarray()

print("Shape of text features:", text_features.shape)


Shape of text features: (10344, 500)


# Prepare Features and Target Variable
Combining all the processed features into a single matrix and extract the target variable.

In [6]:
import numpy as np

X = np.hstack([
    text_features,
    df[['subreddit_label', 'upvote_ratio', 'comments_count', 'subscriber_count', 'hour', 'day_of_week']].values
])

y = df['upvotes']

print("Feature matrix shape:", X.shape)
print("Target variable shape:", y.shape)


Feature matrix shape: (10344, 506)
Target variable shape: (10344,)


# Train-Test Split
Split the data into training and testing sets.

In [7]:
from sklearn.model_selection import train_test_split

# Split data: 70% training, 30% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

print(f"Training set size: {X_train.shape[0]}")
print(f"Testing set size: {X_test.shape[0]}")


Training set size: 7240
Testing set size: 3104


# Train Regression Model
Train a Random Forest Regressor Model to predict upvotes.

In [None]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

print("Model training complete!")

Model training complete!


# Evaluating the Model
Evaluating the performance of the model using Mean Squared Error (MSE) and R-squared.

In [9]:
from sklearn.metrics import mean_squared_error, r2_score

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate evaluation metrics
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")


Mean Squared Error: 596277.0358574743
R-squared: 0.1623106901506015


# Predicting Upvotes for a New Post
Using the trained model to predict upvotes for a new Reddit post.

In [10]:
# Example new post data
new_post = {
    'title': "How to start with Data Science?",
    'content': "I am looking for beginner-friendly resources to learn Data Science.",
    'subreddit_name': "datascience",
    'upvote_ratio': 0.8,
    'comments_count': 15,
    'subscriber_count': 500000,
    'hour': 14,
    'day_of_week': 2
}

# Preprocess new post
new_combined_text = tfidf.transform([new_post['title'] + " " + new_post['content']]).toarray()
new_subreddit_label = le.transform([new_post['subreddit_name']])[0]

# Combine features
new_X = np.hstack([
    new_combined_text,
    np.array([[new_subreddit_label, new_post['upvote_ratio'], new_post['comments_count'], new_post['subscriber_count'], new_post['hour'], new_post['day_of_week']]])
])

# Predict upvotes
predicted_upvotes = model.predict(new_X)
print(f"Predicted Upvotes: {int(predicted_upvotes[0])}")


ValueError: y contains previously unseen labels: 'datascience'