In [None]:
!pip install sdv

Collecting sdv
  Downloading sdv-1.30.0-py3-none-any.whl.metadata (14 kB)
Collecting boto3<2.0.0,>=1.28 (from sdv)
  Downloading boto3-1.42.4-py3-none-any.whl.metadata (6.8 kB)
Collecting botocore<2.0.0,>=1.31 (from sdv)
  Downloading botocore-1.42.4-py3-none-any.whl.metadata (5.9 kB)
Collecting copulas>=0.12.1 (from sdv)
  Downloading copulas-0.12.3-py3-none-any.whl.metadata (9.5 kB)
Collecting ctgan>=0.11.1 (from sdv)
  Downloading ctgan-0.11.1-py3-none-any.whl.metadata (10 kB)
Collecting deepecho>=0.7.0 (from sdv)
  Downloading deepecho-0.7.0-py3-none-any.whl.metadata (10 kB)
Collecting rdt>=1.18.2 (from sdv)
  Downloading rdt-1.18.2-py3-none-any.whl.metadata (10 kB)
Collecting sdmetrics>=0.21.0 (from sdv)
  Downloading sdmetrics-0.24.0-py3-none-any.whl.metadata (9.3 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3<2.0.0,>=1.28->sdv)
  Downloading jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)
Collecting s3transfer<0.17.0,>=0.16.0 (from boto3<2.0.0,>=1.28->sdv)
  Downloading s3t

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from scipy.stats import ks_2samp
from scipy.stats import chisquare
from scipy.stats import chi2_contingency
from sdv.metadata import SingleTableMetadata
from sdv.single_table import CTGANSynthesizer

### Loading original cleaned dataset


In [None]:

# Read the cleaned reviews table from the working directory.
df_original = pd.read_csv("apr_finaldata.csv")

# Print the (rows, columns) tuple, then preview the first five records.
n_rows, n_cols = df_original.shape
print((n_rows, n_cols))
df_original.head(5)

(8856, 11)


Unnamed: 0,product_id,discounted_price,actual_price,discount_percentage,rating,rating_count,user_id,user_name,review_id,review_title,review_content
0,B07JW9H4J1,399.0,1099.0,64.0,4.2,24269.0,AG3D6O4STAQKAY2UVGEUV46KN35Q,Manav,R3HXWT0LRP0NMF,Satisfied,Looks durable Charging is fine tooNo complains
1,B07JW9H4J1,399.0,1099.0,64.0,4.2,24269.0,AHMY5CWJMMK5BJRBBSNLYT3ONILA,Adarsh gupta,R2AJM3LFTLZHFO,Charging is really fast,Charging is really fast
2,B07JW9H4J1,399.0,1099.0,64.0,4.2,24269.0,AHCTC6ULH4XB6YHDY6PCH2R772LQ,Sundeep,R6AQJGUP6P86,Value for money,good product.
3,B07JW9H4J1,399.0,1099.0,64.0,4.2,24269.0,AGYHHIERNXKA6P5T7CZLXKVPT7IQ,S.Sayeed Ahmed,R1KD19VHEDV0OR,Product review,Till now satisfied with the quality.
4,B07JW9H4J1,399.0,1099.0,64.0,4.2,24269.0,AG4OGOFWXJZTQ2HKYIOCOY3KXF2Q,jaspreet singh,R3C02RMYQMK6FC,Good quality,This is a good product . The charging speed is...


In [None]:
# Keep only the columns needed for sentiment analysis and the recommendation system.
columns_to_keep = ['product_id', 'user_id', 'rating', 'review_content']

# Take an explicit copy so that later column assignments on `df` operate on an
# independent frame rather than on a slice of `df_original`.
df = df_original[columns_to_keep].copy()

df.head()

Unnamed: 0,product_id,user_id,rating,review_content
0,B07JW9H4J1,AG3D6O4STAQKAY2UVGEUV46KN35Q,4.2,Looks durable Charging is fine tooNo complains
1,B07JW9H4J1,AHMY5CWJMMK5BJRBBSNLYT3ONILA,4.2,Charging is really fast
2,B07JW9H4J1,AHCTC6ULH4XB6YHDY6PCH2R772LQ,4.2,good product.
3,B07JW9H4J1,AGYHHIERNXKA6P5T7CZLXKVPT7IQ,4.2,Till now satisfied with the quality.
4,B07JW9H4J1,AG4OGOFWXJZTQ2HKYIOCOY3KXF2Q,4.2,This is a good product . The charging speed is...


In [None]:
# List the distinct raw values stored in the rating column.
pd.unique(df['rating'])

array(['4.2', '4.0', '3.9', '4.1', '4.3', '4.4', '4.5', '3.7', '3.3',
       '3.6', '3.4', '3.8', '3.5', '4.6', '3.2', '5.0', '4.7', '3.0',
       '2.8', '4', '3.1', '4.8', '2.3', '|', '2', '3', '2.6', '2.9'],
      dtype=object)

In [None]:
# Drop rows whose rating is the stray '|' token. `.copy()` makes the result an
# independent frame, so the column assignments in later cells do not write into
# a filtered slice (the pattern pandas flags with SettingWithCopyWarning).
df = df.loc[df['rating'] != '|'].copy()


In [None]:
# Distinct rating values remaining in the column at this point.
pd.unique(df['rating'])

array(['4.2', '4.0', '3.9', '4.1', '4.3', '4.4', '4.5', '3.7', '3.3',
       '3.6', '3.4', '3.8', '3.5', '4.6', '3.2', '5.0', '4.7', '3.0',
       '2.8', '4', '3.1', '4.8', '2.3', '2', '3', '2.6', '2.9'],
      dtype=object)

In [None]:
# Convert the string ratings to floats; any value that cannot be parsed becomes NaN
# (errors='coerce'). `assign` returns a new frame instead of writing a column into
# a frame that may be a filtered slice.
df = df.assign(rating=pd.to_numeric(df['rating'], errors='coerce'))

In [None]:
# Look up the dtype of the rating column.
df.dtypes['rating']

dtype('float64')

In [None]:
# Percentage share of each rating value, most frequent first.
# Only the last expression of a cell is rendered, so this cell contains just the
# one expression whose result is meant to be displayed.
df['rating'].value_counts(normalize=True) * 100


Unnamed: 0_level_0,proportion
rating,Unnamed: 1_level_1
4.1,16.951068
4.2,15.662787
4.3,15.583682
4.0,11.43632
3.9,8.543338
4.4,8.441632
3.8,6.328399
4.5,5.24353
3.7,2.825178
3.6,2.54266


**Inspect Rating Imbalance**

In [None]:
# Share of each rating value in percent, ordered by rating value rather than by frequency.
rating_share = df['rating'].value_counts(normalize=True)
rating_dist = rating_share.sort_index() * 100

print("Original rating distribution (%):")
print(rating_dist)


Original rating distribution (%):
rating
2.0     0.022601
2.3     0.056504
2.6     0.079105
2.8     0.101706
2.9     0.079105
3.0     0.214714
3.1     0.226014
3.2     0.124308
3.3     0.949260
3.4     0.745847
3.5     1.988925
3.6     2.542660
3.7     2.825178
3.8     6.328399
3.9     8.543338
4.0    11.436320
4.1    16.951068
4.2    15.662787
4.3    15.583682
4.4     8.441632
4.5     5.243530
4.6     1.084868
4.7     0.418126
4.8     0.203413
5.0     0.146909
Name: proportion, dtype: float64


**Creating Bins for Balancing**

In [None]:
# Bin edges and labels: [0, 2.5] -> Low, (2.5, 3.5] -> Medium,
# (3.5, 4.5] -> High, (4.5, 5.1] -> Very_High.
bin_edges = [0, 2.5, 3.5, 4.5, 5.1]
bin_labels = ['Low', 'Medium', 'High', 'Very_High']

# include_lowest=True closes the first interval on the left so a value of 0 is binned too.
df['rating_bin'] = pd.cut(df['rating'], bins=bin_edges, labels=bin_labels, include_lowest=True)

bin_share = df['rating_bin'].value_counts(normalize=True) * 100
print("Original rating_bin distribution (%):")
print(bin_share)


Original rating_bin distribution (%):
rating_bin
High         93.558594
Medium        4.508984
Very_High     1.853317
Low           0.079105
Name: proportion, dtype: float64


**Oversample to Balance Rating Bins**

In [None]:
from sklearn.utils import resample

# Number of rows currently in each rating bin.
bin_sizes = df['rating_bin'].value_counts()
print("Original bin sizes:")
print(bin_sizes)

# Every bin is upsampled (with replacement) to the size of the largest bin.
max_size = bin_sizes.max()
print("\nTarget size per bin:", max_size)

# `rating_bin` is categorical. Passing observed=True makes the groupby iterate only
# over bins that actually contain rows and states the choice explicitly instead of
# relying on the pandas default for categorical groupers. The length check is kept
# as a safeguard so `resample` is never called on an empty group.
# NOTE(review): very small bins are repeated many times by sampling with
# replacement, so the balanced frame contains many exact duplicate rows.
balanced_chunks = [
    resample(group_df, replace=True, n_samples=max_size, random_state=42)
    for _, group_df in df.groupby('rating_bin', observed=True)
    if len(group_df) > 0
]

# Stack the per-bin samples into one frame with a fresh 0..N-1 index.
df_balanced = pd.concat(balanced_chunks, ignore_index=True)

print("\nBalanced shape:", df_balanced.shape)

print("\nBalanced rating_bin distribution:")
print(df_balanced['rating_bin'].value_counts(normalize=True) * 100)


Original bin sizes:
rating_bin
High         8279
Medium        399
Very_High     164
Low             7
Name: count, dtype: int64

Target size per bin: 8279

Balanced shape: (33116, 5)

Balanced rating_bin distribution:
rating_bin
Low          25.0
Medium       25.0
High         25.0
Very_High    25.0
Name: proportion, dtype: float64


  for rating_class, group_df in df.groupby('rating_bin'):


In [None]:
# Preview the first five rows of the oversampled frame.
df_balanced.head(5)

Unnamed: 0,product_id,user_id,rating,review_content,rating_bin
0,B0BPJBTB3F,AE4Q5XQ7SZW35EEUJKQ3IV2IIBQQ,2.0,Best heater at this price. Quality is very goo...,Low
1,B0BFBNXS94,AE762UDUDQPW4R4QHHTIL7TPTJUA,2.3,Very very bad portable,Low
2,B0BFBNXS94,AEGZSJIUSKF2EKIKGLNKY2CU6WXA,2.3,First charge problemSecond motor proble,Low
3,B0BPJBTB3F,AE4Q5XQ7SZW35EEUJKQ3IV2IIBQQ,2.0,Best heater at this price. Quality is very goo...,Low
4,B0BFBNXS94,AHAVCLRCPYO2MFYPTURF33N7XH5A,2.3,cheap quality. doesn’t blend at all,Low


**Select Columns for CTGAN (IDs, Rating, and Review Text)**


In [None]:
# Columns handed to the synthesizer; the helper `rating_bin` column is left out.
ctgan_columns = ['product_id', 'user_id', 'rating', 'review_content']
df_ctgan = df_balanced.loc[:, ctgan_columns].copy()
df_ctgan.head()


Unnamed: 0,product_id,user_id,rating,review_content
0,B0BPJBTB3F,AE4Q5XQ7SZW35EEUJKQ3IV2IIBQQ,2.0,Best heater at this price. Quality is very goo...
1,B0BFBNXS94,AE762UDUDQPW4R4QHHTIL7TPTJUA,2.3,Very very bad portable
2,B0BFBNXS94,AEGZSJIUSKF2EKIKGLNKY2CU6WXA,2.3,First charge problemSecond motor proble
3,B0BPJBTB3F,AE4Q5XQ7SZW35EEUJKQ3IV2IIBQQ,2.0,Best heater at this price. Quality is very goo...
4,B0BFBNXS94,AHAVCLRCPYO2MFYPTURF33N7XH5A,2.3,cheap quality. doesn’t blend at all


In [None]:
# Print the column dtypes, non-null counts, and memory usage of the frame that is passed to CTGAN.
df_ctgan.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33116 entries, 0 to 33115
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   product_id      33116 non-null  object 
 1   user_id         33116 non-null  object 
 2   rating          33116 non-null  float64
 3   review_content  33116 non-null  object 
dtypes: float64(1), object(3)
memory usage: 1.0+ MB


### Metadata Definition for CTGAN Synthetic Data Generation


In [None]:
from pathlib import Path

# Declare the semantic type (sdtype) of every column for the synthesizer.
metadata = SingleTableMetadata()

metadata.add_column('product_id', sdtype='id')
metadata.add_column('user_id', sdtype='id')
metadata.add_column('rating', sdtype='numerical')
metadata.add_column('review_content', sdtype='text')

# Check the declared schema for internal consistency before it is used or saved.
metadata.validate()

# `save_to_json` refuses to overwrite an existing file, so remove any copy left
# behind by a previous run; otherwise re-running this cell raises an error.
metadata_path = Path("metadata_ctgan.json")
if metadata_path.exists():
    metadata_path.unlink()
metadata.save_to_json(str(metadata_path))
print("✅ Metadata saved as 'metadata_ctgan.json'.")


✅ Metadata saved as 'metadata_ctgan.json'.



In this step, **metadata is explicitly defined** for each column to guide the CTGAN model in correctly learning the data structure and variable types.

- `product_id` and `user_id` are defined as **identifier (`id`) types**, indicating unique entity keys.
- `rating` is defined as a **numerical variable**, allowing CTGAN to model continuous rating behavior.
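- `review_content` is defined as a **text variable**. Note that the sampled output later in this notebook shows placeholder strings (e.g. `AAAAA`, `AAAAB`) in this column rather than review-like text, so this setting alone does not produce natural-language reviews; a separate text-generation step is needed if realistic reviews are required.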

After defining the schema:
- The metadata is **validated** to ensure consistency and correctness.
- The metadata configuration is **saved as a JSON file** (`metadata_ctgan.json`) for reuse in model training and reproducibility.




**Train CTGAN on Balanced Data**

In [None]:
# CTGANSynthesizer is already imported in the imports cell at the top of the notebook.
ctgan_model = CTGANSynthesizer(
    metadata,
    epochs=20,          # small epoch count to keep training time manageable; increase if more training is needed
    batch_size=120,     # kept a multiple of 10 (pac) to avoid CTGAN's AssertionError
    enable_gpu=True,    # requests GPU training; set to False to force CPU (e.g. after a CUDA out-of-memory error)
    verbose=True        # show the per-epoch progress bar with generator/discriminator losses
)

print("⏳ Training CTGAN on balanced data...")
ctgan_model.fit(df_ctgan)
print("✅ CTGAN training completed.")


⏳ Training CTGAN on balanced data...


Gen. (2.58) | Discrim. (-0.01): 100%|██████████| 20/20 [08:56<00:00, 26.85s/it]

✅ CTGAN training completed.





Proper tuning of batch size and epochs helps with:
- Stable GAN convergence,
- Prevention of runtime assertion errors,
- Efficient utilization of compute resources.


**Generate 50× Synthetic Data**

In [None]:
# The 50x target is based on the cleaned frame `df` (before balancing),
# not on the oversampled frame.
original_n = len(df)
factor = 50
target_rows = factor * original_n

print("Original rows:", original_n)
print("🎯 Target synthetic rows:", target_rows)

# Sample in fixed-size pieces instead of requesting every row in one call.
chunk_size = 5000
full_chunks, leftover = divmod(target_rows, chunk_size)
chunk_plan = [chunk_size] * full_chunks
if leftover:
    chunk_plan.append(leftover)  # final, smaller piece

synthetic_parts = []
remaining = target_rows
for n in chunk_plan:
    print(f"Sampling {n} rows... Remaining after this: {remaining - n}")
    batch = ctgan_model.sample(num_rows=n)
    synthetic_parts.append(batch)
    remaining -= n

# Stitch the pieces into one frame with a fresh 0..N-1 index.
synthetic_ctgan = pd.concat(synthetic_parts, ignore_index=True)

print("Synthetic shape:", synthetic_ctgan.shape)
synthetic_ctgan.head()

Original rows: 8849
🎯 Target synthetic rows: 442450
Sampling 5000 rows... Remaining after this: 437450
Sampling 5000 rows... Remaining after this: 432450
Sampling 5000 rows... Remaining after this: 427450
Sampling 5000 rows... Remaining after this: 422450
Sampling 5000 rows... Remaining after this: 417450
Sampling 5000 rows... Remaining after this: 412450
Sampling 5000 rows... Remaining after this: 407450
Sampling 5000 rows... Remaining after this: 402450
Sampling 5000 rows... Remaining after this: 397450
Sampling 5000 rows... Remaining after this: 392450
Sampling 5000 rows... Remaining after this: 387450
Sampling 5000 rows... Remaining after this: 382450
Sampling 5000 rows... Remaining after this: 377450
Sampling 5000 rows... Remaining after this: 372450
Sampling 5000 rows... Remaining after this: 367450
Sampling 5000 rows... Remaining after this: 362450
Sampling 5000 rows... Remaining after this: 357450
Sampling 5000 rows... Remaining after this: 352450
Sampling 5000 rows... Remainin

Unnamed: 0,product_id,user_id,rating,review_content
0,B08BCKN299,AGNSJO5LAO5FKSHPW3UDKZQLPONQ,4.2,AAAAA
1,B06XDKWLJH,AF3OBVMLY5I6X3IFX2DKIFEYMGNA,4.0,AAAAB
2,B08FB2LNSZ,AGK67PKY5YNSHMUNIPVHWPQKPBLA,3.9,AAAAC
3,B0B9JZW1SQ,AGYWGDEV2VA2GXFIOTTX545DCUMQ,4.1,AAAAD
4,B00LUGTJGO,AG2KSOZBBZY3A37U4Q273OYH2IAQ,4.3,AAAAE


- The original dataset size (`original_n`, the row count before balancing) is used as the baseline.
- A **scaling factor of 50** is applied to define the target number of synthetic rows:
  - `target_rows = original_n × 50`
- To avoid **memory overflow issues**, the synthetic data is generated in **fixed-size chunks**:
  - `chunk_size = 5000` rows per batch,
  - Sampling continues iteratively until the full target size is reached.
- Each generated batch is temporarily stored and then **concatenated into a single DataFrame** (`synthetic_ctgan`).

This **chunked sampling strategy** is especially important when:
- Working with **large synthetic targets**,
- Handling **text columns** such as `review_content`,
- Operating under **GPU/CPU memory constraints**.

**Save Synthetic Data**

In [None]:
# Write the synthetic table to disk without the index column.
output_csv = "synthetic_ctgan_rating_balanced_50x.csv"
synthetic_ctgan.to_csv(output_csv, index=False)

# Trigger a browser download only when running in Google Colab. Elsewhere the
# `google.colab` package is not importable, so the file is simply left in the
# working directory instead of crashing the cell.
try:
    from google.colab import files
except ImportError:
    print(f"Saved '{output_csv}' (not running in Colab, download skipped).")
else:
    files.download(output_csv)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

The result is a portable, reusable **CTGAN-generated synthetic dataset file** ready for modeling.