In [4]:
import pandas as pd

# Read the scraped subreddit posts from disk into a DataFrame
df = pd.read_csv(filepath_or_buffer="subreddit_posts.csv")

# Sanity-check the load by printing the leading rows (head() defaults to 5)
print(df.head(n=5))


   subreddit                                              title  \
0  AskReddit                2024 United States Elections Thread   
1  AskReddit  What is the worst atrocity committed in human ...   
2  AskReddit  If the World was told there was a 25% chance t...   
3  AskReddit          What instantly kills the vibe at a party?   
4  AskReddit  Whatâ€™s the most disgusting thing you've ever h...   

                                             content  upvotes  upvote_ratio  \
0  Please use this thread to discuss the ongoing ...      122          0.70   
1                                                NaN     4263          0.87   
2                                                NaN      717          0.92   
3                                                NaN      289          0.92   
4                                                NaN      620          0.86   

   comments_count               author            timestamp  post_id  
0            7415  AskRedditModerators  2024-11-0

In [5]:
# Inspect the column labels available in the loaded frame
df.columns

Index(['subreddit', 'title', 'content', 'upvotes', 'upvote_ratio',
       'comments_count', 'author', 'timestamp', 'post_id'],
      dtype='object')

In [6]:
# Dimensions of the freshly loaded frame as (rows, columns)
df.shape

(18049, 9)

In [7]:
# Distinct values of the 'subreddit' column, converted to a plain Python list
unique_subreddits_list = (
    df['subreddit']
    .unique()
    .tolist()
)
print(unique_subreddits_list)


['AskReddit', 'ChangeMyView', 'TodayILearned', 'self', 'offmychest', 'Showerthoughts', 'personalfinance', 'AskScience', 'Writing', 'Advice', 'LetsNotMeet', 'SelfImprovement', 'DecidingToBeBetter', 'AskHistorians', 'TwoXChromosomes', 'CasualConversation', 'InternetIsBeautiful', 'nosleep', 'WritingPrompts', 'ExplainLikeImFive', 'TrueOffMyChest', 'UnpopularOpinion', 'relationships', 'TrueAskReddit', 'Confession', 'ShortScaryStories', 'ProRevenge', 'NuclearRevenge', 'LifeProTips', 'needadvice', 'TrueUnpopularOpinion']


In [8]:
# Per-column tally of missing entries: isnull() flags them, sum() counts them
null_values = df.isnull().sum()

# Report each column name alongside how many of its values are missing
for column, count in zip(null_values.index, null_values.tolist()):
    print(f"{column}: {count} null values")


subreddit: 0 null values
title: 0 null values
content: 3043 null values
upvotes: 0 null values
upvote_ratio: 0 null values
comments_count: 0 null values
author: 0 null values
timestamp: 0 null values
post_id: 0 null values


In [9]:
# Discard every post whose 'content' is missing; no other column is checked
df = df.dropna(
    axis='index',
    how='any',
    subset=['content'],
)




In [10]:
# Recount missing entries per column after the rows without content were dropped
null_values = df.isnull().sum()

# Report each column name alongside how many of its values are missing
for column, count in zip(null_values.index, null_values.tolist()):
    print(f"{column}: {count} null values")

subreddit: 0 null values
title: 0 null values
content: 0 null values
upvotes: 0 null values
upvote_ratio: 0 null values
comments_count: 0 null values
author: 0 null values
timestamp: 0 null values
post_id: 0 null values


In [11]:
# Re-check the dimensions after removing rows with missing content
df.shape

(15006, 9)

In [12]:
import pandas as pd

# Parse the raw 'timestamp' values into datetimes. No explicit format is given,
# so pandas infers it; errors='coerce' turns any value that cannot be parsed
# into NaT instead of raising.
df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')

# Calendar date of each post
df['date'] = df['timestamp'].dt.date

# The remaining parts come straight from strftime: 'time' is the 12-hour clock
# reading "HH:MM" (without an AM/PM marker), and 'AM_PM' holds that marker alone.
for derived_column, time_format in (('time', '%I:%M'), ('AM_PM', '%p')):
    df[derived_column] = df['timestamp'].dt.strftime(time_format)




In [13]:
# Dimensions after adding the 'date', 'time' and 'AM_PM' columns
df.shape

(15006, 12)

In [14]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Source columns to rescale, and the new columns that receive the scaled values
# (the two lists are positionally paired)
columns_to_normalize = ['upvote_ratio', 'comments_count', 'upvotes']
normalized_columns = [
    'upvote_ratio_normalized',
    'comments_count_normalized',
    'upvotes_normalized',
]

# Fit a MinMaxScaler on the selected columns and transform them in one step
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df[columns_to_normalize])

# Store the scaled values alongside the originals, which stay untouched
df[normalized_columns] = scaled_values

# Print the leading rows, now including the normalized columns
print(df.head())


        subreddit                                              title  \
0       AskReddit                2024 United States Elections Thread   
858  ChangeMyView  Meta: Research Collaboration Opportunity with ...   
859  ChangeMyView  CMV: Goodhearted "cultural appropriation" is f...   
860  ChangeMyView  CMV: Anyone given a life sentence should also ...   
861  ChangeMyView  CMV: Most people are too lazy to actually seek...   

                                               content  upvotes  upvote_ratio  \
0    Please use this thread to discuss the ongoing ...      122          0.70   
858  From time to time, CMV will partner with profe...        7          0.71   
859  I am Austrian and when non-Austrians find a li...      132          0.85   
860  Anyone given a life sentence should also be gi...       45          0.84   
861  I always see people online talking about how t...      123          0.84   

     comments_count               author           timestamp  post_id  \
0      

In [15]:
# Drop the 'author', 'timestamp', and 'post_id' columns
# df = df.drop(columns=['author', 'timestamp', 'post_id'])



In [16]:
# Normalize the free-text columns: replace every non-word character (regex \W)
# with a single space, then lowercase the result.
# regex=True is passed explicitly because pandas >= 2.0 treats the pattern as a
# literal string by default, in which case r'\W' would match nothing useful and
# the text would only be lowercased. On older pandas the explicit flag is a no-op.
df['title'] = df['title'].str.replace(r'\W', ' ', regex=True).str.lower()
df['content'] = df['content'].str.replace(r'\W', ' ', regex=True).str.lower()


In [17]:
# from nltk.tokenize import word_tokenize
# df['title_tokens'] = df['title'].apply(word_tokenize)
# df['content_tokens'] = df['content'].apply(word_tokenize)


In [18]:
# Dimensions after the normalization and text-cleaning cells
df.shape

(15006, 15)

In [19]:
# Persist the processed frame as CSV; index=False leaves the row index out of the file
df.to_csv(
    path_or_buf='cleaned_labeled_subreddit_posts.csv',
    index=False,
)

In [20]:
# df = df[df['upvotes'] < df['upvotes'].quantile(0.95)]  # Removing top 5% of extreme values

In [21]:
# # Convert the entire DataFrame to JSON format and save it to a file
# df.to_json("data.json", orient='records', lines=True)

# # Notify the user that the file has been saved
# print("Data has been saved to 'data.json'")
