In [17]:
import os
import re
import pandas as pd

sql_folder = './sql'  # Folder containing your .sql files
output_folder = './csv'  # Folder to save extracted CSVs
os.makedirs(output_folder, exist_ok=True)

In [18]:
def extract_rows_from_sql(sql_text):
    insert_statements = re.findall(
        r"INSERT INTO [`\"]?([\w_]+)[`\"]?.*?VALUES\s*(.*?);", 
        sql_text, re.DOTALL | re.IGNORECASE
    )
    extracted_data = {}
    for table_name, values_block in insert_statements:
        rows = re.findall(r"\((.*?)\)", values_block, re.DOTALL)
        parsed_rows = []
        for row in rows:
            # Split by comma but ignore commas inside quotes
            values = re.split(r",(?=(?:[^']*'[^']*')*[^']*$)", row)
            cleaned = [v.strip().strip("'").strip('"') for v in values]
            parsed_rows.append(cleaned)
        if table_name in extracted_data:
            extracted_data[table_name].extend(parsed_rows)
        else:
            extracted_data[table_name] = parsed_rows
    return extracted_data

In [19]:
for filename in sorted(os.listdir(sql_folder)):
    if filename.endswith(".sql"):
        path = os.path.join(sql_folder, filename)
        with open(path, 'r', encoding='utf-8') as file:
            content = file.read()
        tables = extract_rows_from_sql(content)
        if not tables:
            print(f"⚠️ No valid data found in {filename}")
        for table, rows in tables.items():
            df = pd.DataFrame(rows)
            csv_path = os.path.join(output_folder, f"{table}.csv")
            df.to_csv(csv_path, index=False, header=False)
            print(f"✅ Extracted {len(rows)} rows from {filename} → {table}.csv")

⚠️ No valid data found in 01_mysql_create.sql
✅ Extracted 9235 rows from 02_mysql_populate_author.sql → author.csv
✅ Extracted 568 rows from 03_mysql_populate_publisher.sql → publisher.csv
✅ Extracted 27 rows from 04_mysql_populate_lookups.sql → book_language.csv
✅ Extracted 4 rows from 04_mysql_populate_lookups.sql → shipping_method.csv
✅ Extracted 2 rows from 04_mysql_populate_lookups.sql → address_status.csv
✅ Extracted 6 rows from 04_mysql_populate_lookups.sql → order_status.csv
✅ Extracted 37 rows from 05_mysql_populate_book.sql → book.csv
✅ Extracted 17642 rows from 06_mysql_populate_bookauthor.sql → book_author.csv
✅ Extracted 232 rows from 07_mysql_populate_country.sql → country.csv
✅ Extracted 1000 rows from 08_mysql_populate_address.sql → address.csv
✅ Extracted 2000 rows from 09_mysql_populate_customer.sql → customer.csv
⚠️ No valid data found in 10_mysql_populate_others.sql
⚠️ No valid data found in 11_mysql_populate_order.sql
⚠️ No valid data found in 12_mysql_populate_ord

In [9]:

import os
import re
import pandas as pd
import random
from datetime import datetime, timedelta

sql_folder = './sql'
output_folder = './csv_output'
os.makedirs(output_folder, exist_ok=True)

In [11]:
def extract_rows_from_sql(sql_text):
    insert_statements = re.findall(
        r'INSERT INTO [`"]?([\w_]+)[`"]?.*?VALUES\s*(.*?);',
        sql_text, re.DOTALL | re.IGNORECASE
    )
    extracted_data = {}
    for table_name, values_block in insert_statements:
        rows = re.findall(r"\((.*?)\)", values_block, re.DOTALL)
        parsed_rows = []
        for row in rows:
            values = re.split(r",(?=(?:[^']*'[^']*')*[^']*$)", row)
            cleaned = [v.strip().strip("'").strip('"') for v in values]
            parsed_rows.append(cleaned)
        if table_name in extracted_data:
            extracted_data[table_name].extend(parsed_rows)
        else:
            extracted_data[table_name] = parsed_rows
    return extracted_data

In [12]:

def generate_customer_address():
    customer_ids = list(range(1, 2001))
    address_ids = random.choices(range(1, 1001), k=2000)
    status_ids = [1] * 2000

    customer_ids += random.choices(range(1, 2001), k=750)
    address_ids += random.choices(range(1, 1001), k=750)
    status_ids += [1] * 750

    customer_ids += random.choices(range(1, 2001), k=400)
    address_ids += random.choices(range(1, 1001), k=400)
    status_ids += [2] * 400

    customer_ids += random.choices(range(1, 2001), k=200)
    address_ids += random.choices(range(1, 1001), k=200)
    status_ids += [1] * 200

    df = pd.DataFrame({
        'customer_id': customer_ids,
        'address_id': address_ids,
        'status_id': status_ids
    })
    df.to_csv(f"{output_folder}/customer_address.csv", index=False)
    print("✅ customer_address.csv created")
generate_customer_address()

✅ customer_address.csv created


In [13]:

def generate_cust_order():
    num_orders = 7550
    data = {
        "order_date": [datetime.now() - timedelta(days=random.randint(0, 1095)) for _ in range(num_orders)],
        "customer_id": [random.randint(1, 2000) for _ in range(num_orders)],
        "shipping_method_id": [random.randint(1, 4) for _ in range(num_orders)],
        "dest_address_id": [random.randint(1, 1000) for _ in range(num_orders)],
    }
    df = pd.DataFrame(data)
    df["order_date"] = df["order_date"].dt.strftime('%Y-%m-%d %H:%M:%S')
    df.to_csv(f"{output_folder}/cust_order.csv", index=False)
    print("✅ cust_order.csv created")
generate_cust_order()

✅ cust_order.csv created


In [14]:

def generate_order_line():
    record_count = 4000 + 2000 + 1000 + 300 + 500 + 50
    order_ids = [random.randint(1000, 9999) for _ in range(record_count)]
    book_ids = [random.randint(1, 11126) for _ in range(record_count)]
    prices = [round(random.uniform(0, 20), 2) for _ in range(record_count)]

    df = pd.DataFrame({
        'order_id': order_ids,
        'book_id': book_ids,
        'price': prices
    })
    df.to_csv(f"{output_folder}/order_line.csv", index=False)
    print("✅ order_line.csv created")
generate_order_line()

✅ order_line.csv created


In [15]:

def generate_order_history():
    record_count = 7547 + 6800 + 4000 + 3500 + 300 + 200
    order_ids = list(range(1001, 1001 + record_count))
    status_ids = [1]*7547 + [2]*6800 + [3]*4000 + [4]*3500 + [5]*300 + [6]*200
    random.shuffle(status_ids)
    status_dates = [datetime.now() - timedelta(days=random.randint(1, 30)) for _ in range(len(status_ids))]

    df = pd.DataFrame({
        'order_id': random.choices(order_ids, k=len(status_ids)),
        'status_id': status_ids,
        'status_date': [d.strftime('%Y-%m-%d %H:%M:%S') for d in status_dates]
    })
    df.to_csv(f"{output_folder}/order_history.csv", index=False)
    print("✅ order_history.csv created")
generate_order_history()

✅ order_history.csv created


In [16]:
print('🎉 All SQL files processed and missing ones generated as CSV.')

🎉 All SQL files processed and missing ones generated as CSV.


In [20]:

import pandas as pd
import os

# Load CSVs and check structure
folder_path = '.'
files = [f for f in os.listdir(folder_path) if f.endswith('.csv') and not f.startswith('cleaned_')]

for file in sorted(files):
    print(f"Loading: {file}")
    df = pd.read_csv(file)
    print(f"\n{'='*40}\nDATAFRAME: {file.upper().replace('.CSV','')}\nShape: {df.shape}")
    print("Missing values:\n", df.isnull().sum())

Loading: address.csv

DATAFRAME: ADDRESS
Shape: (999, 5)
Missing values:
 1                      0
57                     0
Glacier Hill Avenue    0
Torbat-e Jām           0
95                     0
dtype: int64
Loading: address_status.csv

DATAFRAME: ADDRESS_STATUS
Shape: (1, 2)
Missing values:
 1         0
Active    0
dtype: int64
Loading: author.csv

DATAFRAME: AUTHOR
Shape: (9234, 2)
Missing values:
 A. Bartlett Giamatti    0
1                       2
dtype: int64
Loading: book.csv

DATAFRAME: BOOK
Shape: (36, 7)
Missing values:
 The World''s First Love: Mary  Mother of God     0
1                                               10
8987059752                                      10
2                                               10
276                                             10
1996-09-01                                      10
1010                                            10
dtype: int64
Loading: book_author.csv

DATAFRAME: BOOK_AUTHOR
Shape: (17641, 2)
Missing values:
 1570  

In [21]:

files = [f for f in os.listdir('.') if f.endswith('.csv') and not f.startswith('cleaned_')]

for file in sorted(files):
    print(f"Cleaning: {file}")
    df = pd.read_csv(file)
    df_cleaned = df.dropna()
    df_cleaned.to_csv(f"cleaned_{file}", index=False)

print("✅ Cleaned files saved with 'cleaned_' prefix.")

Cleaning: address.csv
Cleaning: address_status.csv
Cleaning: author.csv
Cleaning: book.csv
Cleaning: book_author.csv
Cleaning: book_language.csv
Cleaning: country.csv
Cleaning: cust_order.csv
Cleaning: customer.csv
Cleaning: customer_address.csv
Cleaning: order_history.csv
Cleaning: order_line.csv
Cleaning: order_status.csv
Cleaning: publisher.csv
Cleaning: shipping_method.csv
✅ Cleaned files saved with 'cleaned_' prefix.


In [22]:

from datetime import timedelta

cust_order = pd.read_csv("cleaned_cust_order.csv")
cust_order["order_date"] = pd.to_datetime(cust_order["order_date"])

# Create churn labels
cutoff = cust_order["order_date"].max() - timedelta(days=180)
last_order = cust_order.groupby("customer_id")["order_date"].max().reset_index()
last_order["churn"] = (last_order["order_date"] < cutoff).astype(int)

# Aggregate features
features = cust_order.groupby("customer_id").agg({
    "order_date": "count",
    "shipping_method_id": "nunique",
    "dest_address_id": "nunique"
}).reset_index()
features.columns = ["customer_id", "order_count", "shipping_variation", "address_count"]

# Merge and normalize
df = pd.merge(features, last_order[["customer_id", "churn"]], on="customer_id")
for col in ["order_count", "shipping_variation", "address_count"]:
    df[col] = (df[col] - df[col].min()) / (df[col].max() - df[col].min() + 1e-9)

df.to_csv("churn_model_data.csv", index=False)
df.head()

Unnamed: 0,customer_id,order_count,shipping_variation,address_count,churn
0,1,0.25,0.666667,0.25,0
1,2,0.416667,0.666667,0.416667,0
2,4,0.166667,0.333333,0.166667,1
3,5,0.166667,0.333333,0.166667,1
4,6,0.666667,0.666667,0.666667,0


In [23]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import pickle

# Load dataset
df = pd.read_csv("churn_model_data.csv")
X = df[["order_count", "shipping_variation", "address_count"]]
y = df["churn"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize models
models = {
    "LogisticRegression": LogisticRegression(),
    "RandomForest": RandomForestClassifier(),
    "DecisionTree": DecisionTreeClassifier(),
    "KNN": KNeighborsClassifier()
}

# Train and evaluate
best_model = None
best_score = 0

for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    score = accuracy_score(y_test, preds)
    print(f"{name} Accuracy: {score:.4f}")
    if score > best_score:
        best_score = score
        best_model = model
        best_model_name = name

# Save the best model
with open("best_churn_model.pkl", "wb") as f:
    pickle.dump(best_model, f)

print(f"✅ Best Model: {best_model_name} with accuracy {best_score:.4f}")

LogisticRegression Accuracy: 0.6497
RandomForest Accuracy: 0.6497
DecisionTree Accuracy: 0.6514
KNN Accuracy: 0.6054
✅ Best Model: DecisionTree with accuracy 0.6514


In [24]:

import pickle
model = pickle.load(open("best_churn_model.pkl", "rb"))
df["predicted_churn"] = model.predict(X)
df.to_csv("churn_prediction_results.csv", index=False)
df.head(10)

Unnamed: 0,customer_id,order_count,shipping_variation,address_count,churn,predicted_churn
0,1,0.25,0.666667,0.25,0,0
1,2,0.416667,0.666667,0.416667,0,0
2,4,0.166667,0.333333,0.166667,1,1
3,5,0.166667,0.333333,0.166667,1,1
4,6,0.666667,0.666667,0.666667,0,0
5,7,0.083333,0.333333,0.083333,0,1
6,8,0.333333,0.333333,0.333333,1,0
7,9,0.5,0.666667,0.5,1,0
8,10,0.25,0.666667,0.25,0,0
9,11,0.666667,1.0,0.666667,0,0


In [25]:

df = pd.read_csv("churn_prediction_results.csv")
df.head()

Unnamed: 0,customer_id,order_count,shipping_variation,address_count,churn,predicted_churn
0,1,0.25,0.666667,0.25,0,0
1,2,0.416667,0.666667,0.416667,0,0
2,4,0.166667,0.333333,0.166667,1,1
3,5,0.166667,0.333333,0.166667,1,1
4,6,0.666667,0.666667,0.666667,0,0
