# Import necessary libraries

In [1]:
import pandas as pd
import numpy as np
import os

# Get the Twitter data

In [2]:
# Raw CSV of the Twitter sentiment dataset (hosted on GitHub)
x_url = 'https://raw.githubusercontent.com/PratishMashankar/twitter-sentiment-analysis/refs/heads/master/data/Twitter_Data.csv'

# Download the CSV and parse it into a dataframe
df_x = pd.read_csv(filepath_or_buffer=x_url)

# Preview the first five rows (last expression, so the notebook renders it)
df_x.head(n=5)

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [3]:
# Report the (rows, columns) shape of the Twitter dataframe
print("Size of the DataFrame : ", df_x.shape)

# Report the column labels of the Twitter dataframe
print("Columns of the DataFrame : ", df_x.columns)

Size of the DataFrame :  (162980, 2)
Columns of the DataFrame :  Index(['clean_text', 'category'], dtype='object')


In [4]:
# Print a summary of the Twitter dataframe: index range, column names,
# non-null counts per column, dtypes and approximate memory usage.
df_x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162980 entries, 0 to 162979
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   clean_text  162976 non-null  object 
 1   category    162973 non-null  float64
dtypes: float64(1), object(1)
memory usage: 2.5+ MB


In [5]:
# Show the dtype of each Twitter column.
# print() puts its separator after the "\n", hence the leading space on the first output row.
print("Data types of the columns : \n", df_x.dtypes)

Data types of the columns : 
 clean_text     object
category      float64
dtype: object


In [6]:
# Count the missing (NaN) entries per column of the Twitter dataframe
print("Number of missing values in each column : \n", df_x.isna().sum())

Number of missing values in each column : 
 clean_text    4
category      7
dtype: int64


In [7]:
# Discard every Twitter row that has a missing value in any column;
# .copy() gives an independent frame rather than a view of the original.
df_x = df_x.dropna(axis="index", how="any").copy()

In [8]:
# Rename the Twitter dataframe's columns to the common schema ('content', 'label')
# so it lines up with the Reddit dataframe when the two are concatenated.
# Reassign rather than use inplace=True: the cell then produces a new frame and
# stays safe to re-run (labels that are already renamed are simply ignored).
df_x = df_x.rename(columns={'clean_text': 'content', 'category': 'label'})

# Get the Reddit data

In [9]:
# Raw CSV of the Reddit sentiment dataset (hosted on GitHub)
r_url = 'https://raw.githubusercontent.com/Himanshu-1703/reddit-sentiment-analysis/refs/heads/main/data/reddit.csv'

# Download the CSV and parse it into a dataframe
df_redd = pd.read_csv(filepath_or_buffer=r_url)

# Preview the first five rows (last expression, so the notebook renders it)
df_redd.head(n=5)

Unnamed: 0,clean_comment,category
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1


In [10]:
# Report the (rows, columns) shape of the Reddit dataframe
print("Size of the DataFrame : ", df_redd.shape)

# Report the column labels of the Reddit dataframe
print("Columns of the DataFrame : ", df_redd.columns)

Size of the DataFrame :  (37249, 2)
Columns of the DataFrame :  Index(['clean_comment', 'category'], dtype='object')


In [11]:
# Print a summary of the Reddit dataframe: index range, column names,
# non-null counts per column, dtypes and approximate memory usage.
df_redd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37249 entries, 0 to 37248
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   clean_comment  37149 non-null  object
 1   category       37249 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 582.1+ KB


In [12]:
# Show the dtype of each Reddit column.
# print() puts its separator after the "\n", hence the leading space on the first output row.
print("Data types of the columns : \n", df_redd.dtypes)

Data types of the columns : 
 clean_comment    object
category          int64
dtype: object


In [13]:
# Count the missing (NaN) entries per column of the Reddit dataframe
print("Number of missing values in each column : \n", df_redd.isna().sum())

Number of missing values in each column : 
 clean_comment    100
category           0
dtype: int64


In [14]:
# Discard every Reddit row that has a missing value in any column;
# .copy() gives an independent frame rather than a view of the original.
df_redd = df_redd.dropna(axis="index", how="any").copy()

In [15]:
# Give the Reddit dataframe the same column names as the Twitter dataframe
# ('content', 'label') so the two can be stacked.
df_redd = df_redd.rename(columns={'clean_comment': 'content', 'category': 'label'})

# Concatenate the two dataframes into one dataframe

In [16]:
# Stack the Twitter rows on top of the Reddit rows and renumber the index from 0.
# Twitter labels are float64 and Reddit labels are int64, so the combined
# 'label' column comes out as float64.
final_df = pd.concat(objs=(df_x, df_redd), axis=0, ignore_index=True)

In [17]:
# Report the (rows, columns) shape of the combined dataframe
print("Size of the DataFrame : ", final_df.shape)

# Report the column labels of the combined dataframe
print("Columns of the DataFrame : ", final_df.columns)

Size of the DataFrame :  (200118, 2)
Columns of the DataFrame :  Index(['content', 'label'], dtype='object')


In [18]:
# Print a summary of the combined dataframe: index range, column names,
# non-null counts per column, dtypes and approximate memory usage.
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200118 entries, 0 to 200117
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   content  200118 non-null  object 
 1   label    200118 non-null  float64
dtypes: float64(1), object(1)
memory usage: 3.1+ MB


In [19]:
# Show the dtype of each column of the combined dataframe.
# print() puts its separator after the "\n", hence the leading space on the first output row.
print("Data types of the columns : \n", final_df.dtypes)

Data types of the columns : 
 content     object
label      float64
dtype: object


In [20]:
# Confirm the combined dataframe has no missing (NaN) entries left in any column
print("Number of missing values in each column : \n", final_df.isna().sum())

Number of missing values in each column : 
 content    0
label      0
dtype: int64


In [21]:
# Count rows that exactly repeat an earlier row (all columns equal);
# the first occurrence of each row is not counted as a duplicate.
print("Number of duplicates in the DataFrame : ", final_df.duplicated(keep='first').sum())

Number of duplicates in the DataFrame :  411


In [22]:
# Keep only the first occurrence of each fully identical row;
# .copy() gives an independent frame rather than a view of the original.
final_df = final_df.drop_duplicates(subset=None, keep='first').copy()

In [23]:
# Report the (rows, columns) shape of the combined dataframe after de-duplication
print("Size of the DataFrame : ", final_df.shape)

Size of the DataFrame :  (199707, 2)


# Exporting the Final Dataframe to CSV

In [25]:
# Export the final dataframe to a csv file (row index is not written).
# DataFrame.to_csv does not create missing parent directories, so make sure
# the target folder exists first; otherwise a fresh checkout without
# '../data/raw' fails with an OSError before anything is written.
output_path = '../data/raw/sentiment_data.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
final_df.to_csv(output_path, index=False)
print("Final dataframe exported to csv file successfully.")

Final dataframe exported to csv file successfully.
