In [1]:
# -------------------------------
# Imports
# -------------------------------
import os
import pandas as pd

In [2]:
# -------------------------------
# Loading the dataset
# -------------------------------
# The raw CSV is resolved relative to the current working directory:
# <cwd>/../data/raw/deepseek_vs_chatgpt.csv
raw_csv_path = os.path.join(
    os.getcwd(), "..", "data", "raw", "deepseek_vs_chatgpt.csv"
)
df = pd.read_csv(raw_csv_path)

In [3]:
# -------------------------------
# Initial Data Inspection
# -------------------------------
# Quick look at the freshly loaded frame: first rows, column summary, shape.
print("Initial data preview:")
print(df.head(), "\n")

print("Dataframe Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the report.
df.info()
print()

print("Dataframe Shape:")
print(df.shape, "\n")


Initial data preview:
         Date  Month_Num   Weekday AI_Platform   AI_Model_Version  \
0  2024-09-21          9  Saturday     ChatGPT        GPT-4-turbo   
1  2024-09-21          9  Saturday     ChatGPT        GPT-4-turbo   
2  2024-09-21          9  Saturday     ChatGPT        GPT-4-turbo   
3  2024-09-21          9  Saturday     ChatGPT        GPT-4-turbo   
4  2024-05-16          5  Thursday    DeepSeek  DeepSeek-Chat 1.5   

   Active_Users  New_Users  Churned_Users  Daily_Churn_Rate  Retention_Rate  \
0        500000      25000          25000              0.05            0.95   
1        500000      25000          25000              0.05            0.95   
2        500000      25000          25000              0.05            0.95   
3        500000      25000          25000              0.05            0.95   
4       1700000     170000          34000              0.02            0.95   

   ... Session_Duration_sec     Device_Type Language  Response_Accuracy  \
0  ...       

In [4]:
# -------------------------------
# Data Cleaning
# -------------------------------
# NOTE: this cell reassigns `df` and drops columns, so it cannot be re-run
# without first reloading `df` (the second drop would raise KeyError).

# Report and remove fully duplicated rows.
duplicate_count = df.duplicated().sum()
print(f"Number of duplicate rows found: {duplicate_count}")
df = df.drop_duplicates()

print("Missing values per column before cleaning:")
print(df.isnull().sum(), "\n")

# Drop every row that has a null in any column.
df = df.dropna()
print("Missing values per column after dropping rows with nulls:")
print(df.isnull().sum(), "\n")

# Replace each date with its calendar year. The whole column is parsed in a
# single vectorized call instead of calling pd.to_datetime once per row;
# values that cannot be parsed become NaN because of errors='coerce'.
df['Date'] = pd.to_datetime(df['Date'], errors='coerce').dt.year

# Remove columns that are not carried into the preprocessed dataset.
df = df.drop(columns=['Weekday', 'AI_Model_Version', 'New_Users', 'Retention_Rate', 'User_ID', 'Input_Text'])

print("Dataframe shape after cleaning:")
print(df.shape, "\n")
df.head()

Number of duplicate rows found: 0
Missing values per column before cleaning:
Date                               0
Month_Num                          0
Weekday                            0
AI_Platform                        0
AI_Model_Version                   0
Active_Users                       0
New_Users                          0
Churned_Users                      0
Daily_Churn_Rate                   0
Retention_Rate                     0
User_ID                            0
Query_Type                         0
Input_Text                         0
Input_Text_Length                  0
Response_Tokens                    0
Topic_Category                     0
User_Rating                        0
User_Experience_Score              0
Session_Duration_sec               0
Device_Type                        0
Language                           0
Response_Accuracy                379
Response_Speed_sec                 0
Response_Time_Category             0
Correction_Needed                  

Unnamed: 0,Date,Month_Num,AI_Platform,Active_Users,Churned_Users,Daily_Churn_Rate,Query_Type,Input_Text_Length,Response_Tokens,Topic_Category,...,Session_Duration_sec,Device_Type,Language,Response_Accuracy,Response_Speed_sec,Response_Time_Category,Correction_Needed,User_Return_Frequency,Customer_Support_Interactions,Region
0,2024,9,ChatGPT,500000,25000,0.05,General,7,280,Professional Writing,...,40,Mobile,es,0.7842,3.3,Standard,0,6,2,Antarctica (the territory South of 60 deg S)
1,2024,9,ChatGPT,500000,25000,0.05,General,7,80,Content Creation,...,24,Laptop/Desktop,zh,0.8194,3.28,Standard,1,2,2,Ukraine
2,2024,9,ChatGPT,500000,25000,0.05,General,8,131,Best Practices,...,34,Mobile,en,0.809,3.07,Standard,0,2,0,Grenada
3,2024,9,ChatGPT,500000,25000,0.05,General,7,426,Content Creation,...,18,Mobile,fr,0.8233,3.06,Standard,0,9,0,Guyana
4,2024,5,DeepSeek,1700000,34000,0.02,Technical,6,215,Debugging,...,10,Mobile,de,0.9366,1.48,Fast,0,9,3,India


In [5]:
# -------------------------------
# Data Type Conversion for Specific Columns
# -------------------------------
numeric_cols = ['Response_Accuracy', 'Response_Speed_sec', 'User_Rating', 'User_Experience_Score']

# Restrict the list to columns that actually exist, and use that same list
# for both the conversion and the dropna below. Passing an absent column to
# dropna(subset=...) raises KeyError, so guarding only the loop is not enough.
missing_numeric_cols = [col for col in numeric_cols if col not in df.columns]
if missing_numeric_cols:
    print(f"Skipping columns not present in the dataframe: {missing_numeric_cols}")
numeric_cols = [col for col in numeric_cols if col in df.columns]

for col in numeric_cols:
    # Values that cannot be parsed as numbers become NaN.
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Drop rows where conversion to numeric has produced NaN values for the specified columns
df = df.dropna(subset=numeric_cols)
print("Data types after conversion:")
df.dtypes

Data types after conversion:


Date                               int64
Month_Num                          int64
AI_Platform                       object
Active_Users                       int64
Churned_Users                      int64
Daily_Churn_Rate                 float64
Query_Type                        object
Input_Text_Length                  int64
Response_Tokens                    int64
Topic_Category                    object
User_Rating                        int64
User_Experience_Score            float64
Session_Duration_sec               int64
Device_Type                       object
Language                          object
Response_Accuracy                float64
Response_Speed_sec               float64
Response_Time_Category            object
Correction_Needed                  int64
User_Return_Frequency              int64
Customer_Support_Interactions      int64
Region                            object
dtype: object

In [6]:
# -------------------------------
# Calculate Summary Statistics for Specific Numeric Columns
# -------------------------------
# One row per AI_Platform; for every column in numeric_cols compute the
# mean, standard deviation, minimum and maximum.
summary_stats = ['mean', 'std', 'min', 'max']
per_platform = df.groupby('AI_Platform')
model_summary = per_platform[numeric_cols].agg(summary_stats)
print("Summary statistics by Model:")
model_summary

Summary statistics by Model:


Unnamed: 0_level_0,Response_Accuracy,Response_Accuracy,Response_Accuracy,Response_Accuracy,Response_Speed_sec,Response_Speed_sec,Response_Speed_sec,Response_Speed_sec,User_Rating,User_Rating,User_Rating,User_Rating,User_Experience_Score,User_Experience_Score,User_Experience_Score,User_Experience_Score
Unnamed: 0_level_1,mean,std,min,max,mean,std,min,max,mean,std,min,max,mean,std,min,max
AI_Platform,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
ChatGPT,0.802574,0.064646,0.6542,0.9467,3.442135,0.870924,1.82,5.19,3.994892,0.76971,3,5,1.229591,0.355456,0.48,1.95
DeepSeek,0.899684,0.040618,0.8055,0.9972,1.23476,0.447468,0.33,2.18,4.802412,0.398222,4,5,2.034912,0.180626,1.54,2.28


In [7]:
# -------------------------------
# Save Preprocessed Data
# -------------------------------
# All outputs go to <cwd>/../data/preprocessed. Create the directory first:
# DataFrame.to_csv does not create missing parent directories and would
# otherwise fail on a fresh checkout.
preprocessed_dir = os.path.join(os.getcwd(), "..", "data", "preprocessed")
os.makedirs(preprocessed_dir, exist_ok=True)

# Full cleaned dataset.
preprocessed_data_path = os.path.join(preprocessed_dir, "preprocessed_data.csv")
df.to_csv(preprocessed_data_path, index=False)
print("Preprocessed data saved.")

# Per-platform subsets, selected by exact match on the AI_Platform column.
deepseek_df = df[df['AI_Platform'] == 'DeepSeek']
deepseek_df.to_csv(os.path.join(preprocessed_dir, "deepseek_data.csv"), index=False)

chatgpt_df = df[df['AI_Platform'] == 'ChatGPT']
chatgpt_df.to_csv(os.path.join(preprocessed_dir, "chatgpt_data.csv"), index=False)
print("DeepSeek and ChatGPT data saved.")

Preprocessed data saved.
DeepSeek and ChatGPT data saved.
