In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Apply seaborn's "whitegrid" style as the theme for the plots drawn in
# this notebook.
sns.set_theme(style="whitegrid")

In [13]:
# Load the social media engagement dataset (the path is relative to the
# notebook's working directory).
df = pd.read_csv("social_media_engagement_dataset.csv")

# DataFrame.info() prints its summary itself and returns None, so it is
# called directly; wrapping it in print() appends a stray "None" line.
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   platform         10000 non-null  str    
 1   post_type        10000 non-null  str    
 2   post_length      10000 non-null  int64  
 3   views            10000 non-null  int64  
 4   likes            10000 non-null  int64  
 5   comments         10000 non-null  int64  
 6   shares           10000 non-null  int64  
 7   engagement_rate  10000 non-null  float64
dtypes: float64(1), int64(5), str(2)
memory usage: 625.1 KB
None


In [14]:
# Missing values per column before imputation
print(df.isnull().sum())

# Impute numeric columns with their column mean. Selecting on "number"
# covers every numeric width (e.g. int32/float32), not only int64/float64.
numeric_columns = df.select_dtypes(include="number").columns
df[numeric_columns] = df[numeric_columns].fillna(df[numeric_columns].mean())

# Impute categorical (text) columns with their most frequent value
categorical_columns = df.select_dtypes(include=['object', 'string']).columns
for col in categorical_columns:
    modes = df[col].mode()
    # mode() returns an empty Series when every value in the column is
    # missing; skip such a column instead of failing on the first-element
    # lookup.
    if not modes.empty:
        df[col] = df[col].fillna(modes.iloc[0])

# Missing values per column after imputation
print(df.isnull().sum())


platform           0
post_type          0
post_length        0
views              0
likes              0
comments           0
shares             0
engagement_rate    0
dtype: int64
platform           0
post_type          0
post_length        0
views              0
likes              0
comments           0
shares             0
engagement_rate    0
dtype: int64


In [15]:
# One-hot encode every categorical column; drop_first=True keeps k-1
# indicator columns for a variable with k levels.
df_encoded = pd.get_dummies(
    data=df,
    columns=categorical_columns,
    drop_first=True,
)

# Preview the first five rows of the encoded frame
print(df_encoded.head(5))


   post_length   views  likes  comments  shares  engagement_rate  \
0           62   91660   2968       276     346         0.039166   
1          104  113115   4164       632     406         0.045989   
2           46   36043   3125       188     100         0.094692   
3           39  124886   5970       948     578         0.060023   
4           42   82831   8212      1104     334         0.116502   

   platform_Instagram  platform_Twitter  post_type_Text  post_type_Video  
0               False             False            True            False  
1                True             False           False             True  
2               False             False           False             True  
3               False             False           False            False  
4                True             False           False             True  
