In [1]:
import os

# --- 1. DETECT LOCATION & SETUP PATHS ---
# Decide where the generated package is written: when the kernel's working
# directory is the 'notebooks' folder itself, use the sibling "../src";
# otherwise assume the project root and use "src".
# The final path component is compared exactly, because a plain suffix test
# would also match directories such as "old_notebooks".
if os.path.basename(os.getcwd()) == "notebooks":
    print(f"üìÇ Detected execution inside '{os.getcwd()}'. Saving files to parent directory...")
    base_dir = "../src"
else:
    print("üìÇ Detected execution in root. Saving files to 'src/' directory...")
    base_dir = "src"

# Create the directory; exist_ok=True keeps the cell safe to re-run.
os.makedirs(base_dir, exist_ok=True)

# --- FILE 1: __init__.py ---
# Package initializer text: re-exports the four pipeline entry points from
# their submodules.
init_code = """
from .etl import load_data
from .feature_eng import engineer_features
from .predictive_model import train_skip_model
from .clustering import cluster_listeners
"""
init_path = os.path.join(base_dir, "__init__.py")
with open(init_path, mode="w", encoding="utf-8") as init_file:
    init_file.write(init_code)
print(f"‚úÖ Created: {base_dir}/__init__.py")

# --- FILE 2: etl.py ---
# Source text of src/etl.py. It sits inside a triple-double-quoted literal, so
# the generated module uses ''' for its own docstrings.
etl_code = """
import pandas as pd
import json
import os
import glob

# Metadata columns whose missing values are replaced by a readable placeholder.
_PLACEHOLDERS = {
    'master_metadata_track_name': 'Unknown Track',
    'master_metadata_album_artist_name': 'Unknown Artist',
}

def load_data(raw_data_path):
    '''Load every *.json file in raw_data_path into one cleaned DataFrame.

    Each file is parsed with json.load and handed to pd.DataFrame; the frames
    are concatenated with a fresh index. When the columns are present, 'ts' is
    converted to datetime, missing track/artist names are replaced by
    placeholders, and rows are kept only if ms_played > 10000 or skipped is
    True.

    Args:
        raw_data_path: directory that is searched (non-recursively) for *.json.

    Returns:
        The combined, filtered DataFrame.

    Raises:
        FileNotFoundError: if raw_data_path contains no *.json file.
    '''
    print(f"üìÇ ETL Started: Looking for data in {raw_data_path}...")
    # glob returns matches in arbitrary order; sort so row order is reproducible.
    files = sorted(glob.glob(os.path.join(raw_data_path, "*.json")))

    if not files:
        raise FileNotFoundError(f"‚ùå No JSON files found in {raw_data_path}. Please move your files there.")

    data_frames = []
    for file in files:
        with open(file, 'r', encoding='utf-8') as f:
            data = json.load(f)
        data_frames.append(pd.DataFrame(data))

    df = pd.concat(data_frames, ignore_index=True)

    if 'ts' in df.columns:
        df['ts'] = pd.to_datetime(df['ts'])

    # Guard these columns the same way as 'ts', 'ms_played' and 'skipped':
    # an export without them must not fail with a KeyError here.
    for column, placeholder in _PLACEHOLDERS.items():
        if column in df.columns:
            df[column] = df[column].fillna(placeholder)

    if 'ms_played' in df.columns and 'skipped' in df.columns:
        df = df[(df['ms_played'] > 10000) | (df['skipped'] == True)]

    print(f"‚úÖ ETL Complete. Processed {len(df)} rows.")
    return df
"""
with open(os.path.join(base_dir, "etl.py"), "w", encoding='utf-8') as f:
    f.write(etl_code)
print(f"‚úÖ Created: {base_dir}/etl.py")

# --- FILE 3: feature_eng.py ---
# Source text of src/feature_eng.py. It sits inside a triple-double-quoted
# literal, so the generated module uses ''' for its own docstrings.
feat_code = """
import pandas as pd
from sklearn.preprocessing import LabelEncoder

def engineer_features(df):
    '''Return a copy of df with time, target and encoded feature columns added.

    Added columns:
        hour, day_of_week, is_weekend: derived from the datetime column 'ts'.
        is_skipped: 1 where 'skipped' is True, else 0 (all 0 if the column
            is absent).
        reason_start_encoded: LabelEncoder codes of 'reason_start' cast to
            str (only if the column exists).
        shuffle_feature: 1 where 'shuffle' is True, else 0 (only if the
            column exists).

    Args:
        df: DataFrame with a datetime 'ts' column; the input is not modified.

    Returns:
        A new DataFrame containing the original and the added columns.
    '''
    print("‚öôÔ∏è Feature Engineering Started...")
    df = df.copy()

    df['hour'] = df['ts'].dt.hour
    df['day_of_week'] = df['ts'].dt.dayofweek
    # dayofweek runs 0=Monday .. 6=Sunday, so values >= 5 are the weekend.
    df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)

    if 'skipped' in df.columns:
        # Compare instead of casting directly: the column can hold nulls
        # (JSON null -> None), on which astype(int) raises. Nulls count as
        # "not skipped".
        df['is_skipped'] = (df['skipped'] == True).astype(int)
    else:
        df['is_skipped'] = 0

    if 'reason_start' in df.columns:
        le = LabelEncoder()
        df['reason_start_encoded'] = le.fit_transform(df['reason_start'].astype(str))

    if 'shuffle' in df.columns:
        # Same null-safe conversion as for 'skipped'.
        df['shuffle_feature'] = (df['shuffle'] == True).astype(int)

    return df
"""
with open(os.path.join(base_dir, "feature_eng.py"), "w", encoding='utf-8') as f:
    f.write(feat_code)
print(f"‚úÖ Created: {base_dir}/feature_eng.py")

# --- FILE 4: predictive_model.py ---
# Source text of src/predictive_model.py. It sits inside a triple-double-quoted
# literal, so the generated module uses ''' for its own docstrings.
model_code = """
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

def train_skip_model(df):
    '''Train a random forest predicting 'is_skipped' and report its test AUC.

    Uses 'ms_played', 'hour', 'day_of_week' and 'is_weekend', plus
    'reason_start_encoded' and 'shuffle_feature' when those columns exist.
    The data is split 80/20 (stratified on the target, random_state=42).

    Args:
        df: feature DataFrame containing the columns above and 'is_skipped'.

    Returns:
        (rf_model, auc_score, features): the fitted classifier, the ROC AUC
        on the held-out 20%, and the list of feature column names used.

    Raises:
        ValueError: if 'is_skipped' does not contain two classes with at
            least two rows each, so no valid split/AUC is possible.
    '''
    print("ü§ñ Training Predictive Model (Random Forest)...")

    # NOTE(review): 'ms_played' is presumably only known once playback has
    # ended, so it may leak the outcome into the model and inflate the AUC;
    # confirm that using it as a predictor is intended.
    features = ['ms_played', 'hour', 'day_of_week', 'is_weekend']
    if 'reason_start_encoded' in df.columns:
        features.append('reason_start_encoded')
    if 'shuffle_feature' in df.columns:
        features.append('shuffle_feature')

    X = df[features]
    y = df['is_skipped']

    # With a single class predict_proba has only one column and [:, 1] below
    # would raise an IndexError; fail early with an explicit message instead.
    class_counts = y.value_counts()
    if len(class_counts) < 2 or class_counts.min() < 2:
        raise ValueError("Cannot train skip model: 'is_skipped' needs two classes with at least two rows each.")

    # Stratify so both classes appear in the test fold; roc_auc_score is
    # undefined when y_test contains only one class.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    rf_model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
    rf_model.fit(X_train, y_train)

    # Column 1 holds the probability of the positive class (is_skipped == 1).
    y_pred_proba = rf_model.predict_proba(X_test)[:, 1]
    auc_score = roc_auc_score(y_test, y_pred_proba)

    print(f"üéØ Model Results - AUC Score: {auc_score:.2f}")
    return rf_model, auc_score, features
"""
with open(os.path.join(base_dir, "predictive_model.py"), "w", encoding='utf-8') as f:
    f.write(model_code)
print(f"‚úÖ Created: {base_dir}/predictive_model.py")

# --- FILE 5: clustering.py ---
# Source text of src/clustering.py. It sits inside a triple-double-quoted
# literal, so the generated module uses ''' for its own docstrings.
cluster_code = """
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

def cluster_listeners(df, n_clusters=3, min_plays=20):
    '''Group artists into K-Means clusters by skip rate and play count.

    Rows are aggregated per 'master_metadata_album_artist_name' into total
    'ms_played', mean 'is_skipped' and the count of 'ts'. Artists with more
    than min_plays plays are standardised on (Skip_Rate, Play_Count) and
    clustered.

    Args:
        df: feature DataFrame with the columns named above.
        n_clusters: number of K-Means clusters (default 3).
        min_plays: artists need strictly more plays than this (default 20).

    Returns:
        DataFrame with columns Artist, Total_Ms, Skip_Rate, Play_Count and
        Cluster, one row per retained artist.

    Raises:
        ValueError: if fewer than n_clusters artists pass the play filter.
    '''
    print("‚ú® Running K-Means Clustering on Artists...")

    # Named aggregation fixes the output column names explicitly instead of
    # relying on positional renaming after the groupby.
    artist_stats = (
        df.groupby('master_metadata_album_artist_name')
        .agg(
            Total_Ms=('ms_played', 'sum'),
            Skip_Rate=('is_skipped', 'mean'),
            Play_Count=('ts', 'count'),
        )
        .reset_index()
        .rename(columns={'master_metadata_album_artist_name': 'Artist'})
    )
    data = artist_stats[artist_stats['Play_Count'] > min_plays].copy()

    # K-Means needs at least as many samples as clusters; report that
    # clearly rather than failing inside the scaler or the estimator.
    if len(data) < n_clusters:
        raise ValueError(
            f"Need at least {n_clusters} artists with more than {min_plays} plays to cluster, found {len(data)}."
        )

    scaler = StandardScaler()
    X = scaler.fit_transform(data[['Skip_Rate', 'Play_Count']])

    # n_init is pinned because its default differs between scikit-learn
    # releases; an explicit value keeps results comparable across versions.
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    data['Cluster'] = kmeans.fit_predict(X)

    print("‚úÖ Clustering Complete.")
    return data
"""
with open(os.path.join(base_dir, "clustering.py"), "w", encoding='utf-8') as f:
    f.write(cluster_code)
print(f"‚úÖ Created: {base_dir}/clustering.py")

print("\nüéâ SUCCESS! All Python files have been created.")

üìÇ Detected execution inside 'C:\Users\rahul\Spotify-Listening-Analysis\notebooks'. Saving files to parent directory...
‚úÖ Created: ../src/__init__.py
‚úÖ Created: ../src/etl.py
‚úÖ Created: ../src/feature_eng.py
‚úÖ Created: ../src/predictive_model.py
‚úÖ Created: ../src/clustering.py

üéâ SUCCESS! All Python files have been created.


In [None]:
import sys
import os
import traceback
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Locate the project root so Python finds the 'src' folder.
# From inside 'notebooks' the root is one level up; otherwise the current
# directory is taken to be the root, so the cell works from either location.
root_rel = '..' if os.path.basename(os.getcwd()) == "notebooks" else '.'
project_root = os.path.abspath(root_rel)
# Guard against adding a duplicate entry every time the cell is re-run.
if project_root not in sys.path:
    sys.path.append(project_root)

# 2. Import the modules you just created
from src.etl import load_data
from src.feature_eng import engineer_features
from src.predictive_model import train_skip_model
from src.clustering import cluster_listeners

# --- EXECUTION ---

try:
    # A. Load Data
    # Points to <project root>/data/raw, expressed relative to the working directory
    raw_path = os.path.join(root_rel, 'data', 'raw')
    df = load_data(raw_path)

    # B. Feature Engineering
    df_features = engineer_features(df)

    # C. Run Predictive Model (Resume Validation)
    print("\n-----------------------------------")
    rf_model, auc, features = train_skip_model(df_features)
    print(f"‚úÖ RESUME PROOF: Model AUC Score is {auc:.2f}")
    print("-----------------------------------\n")

    # D. Run Clustering (Resume Validation)
    clustered_df = cluster_listeners(df_features)

    # --- VISUALIZATION (The "Improvement") ---

    # Graph 1: Feature Importance (Why do you skip songs?)
    # One horizontal bar per feature, labelled with the feature names.
    fig, ax = plt.subplots(figsize=(10, 5))
    importances = rf_model.feature_importances_
    indices = list(range(len(importances)))
    ax.barh(indices, importances, align='center')
    ax.set_yticks(indices)
    ax.set_yticklabels(features)
    ax.set_xlabel('Relative Importance')
    ax.set_title('Why do I skip songs? (Feature Importance)')
    plt.show()

    # Graph 2: Clusters
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.scatterplot(data=clustered_df, x='Play_Count', y='Skip_Rate', hue='Cluster', palette='viridis', s=100, ax=ax)
    ax.set_title('My Music Taste Clusters')
    ax.set_xlabel('Play Count')
    ax.set_ylabel('Skip Rate')
    plt.show()

except FileNotFoundError as e:
    print(f"\n‚ùå ERROR: {e}")
    print("Please make sure your .json files are inside the 'data/raw' folder!")
except Exception as e:
    print(f"\n‚ùå Something went wrong: {e}")
    # Keep the friendly message but also show where the failure happened;
    # the message alone is rarely enough to debug a pipeline error.
    traceback.print_exc()