In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

def clean_customer_data(df):
    """Clean and prepare customer data"""
 
    df.columns = ['CustomerID', 'CustomerName', 'Region', 'SignupDate']

    df['Region'] = df['Region'].str.strip()

    df['SignupDate'] = pd.to_datetime(df['SignupDate'])
    
    return df

def clean_product_data(df):
    """Clean and prepare product data"""

    df.columns = ['ProductID', 'ProductName', 'Category', 'Price']
    

    df['Category'] = df['Category'].str.strip()
    
    return df

def load_and_prepare_data(customers_file, products_file):
    """Load and prepare the data for the lookalike model"""

    customers = pd.read_csv(r"C:\Users\rahul\Downloads\Data_Science\Customers.csv")
    products = pd.read_csv(r"C:\Users\rahul\Downloads\Data_Science\Products.csv")
    

    customers = clean_customer_data(customers)
    products = clean_product_data(products)

    region_dummies = pd.get_dummies(customers['Region'], prefix='Region')
    

    n_categories = len(products['Category'].unique())
    category_preferences = pd.DataFrame(
        np.random.dirichlet(np.ones(n_categories), size=len(customers)),
        columns=products['Category'].unique(),
        index=customers.index
    )

    feature_df = pd.concat([
        customers[['CustomerID']],
        region_dummies,
        category_preferences
    ], axis=1)
    
    return feature_df

def create_lookalike_model(feature_df):
    """Create and return the lookalike model"""
    # Separate CustomerID and features
    customer_ids = feature_df['CustomerID']
    features = feature_df.drop('CustomerID', axis=1)

    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(features)
    

    similarity_matrix = cosine_similarity(scaled_features)
    
    return customer_ids, similarity_matrix

def get_top_lookalikes(customer_id, customer_ids, similarity_matrix, n=3):
    """Get top n lookalike customers for a given customer ID"""

    customer_idx = customer_ids[customer_ids == customer_id].index[0]
    

    similarities = similarity_matrix[customer_idx]
    

    similar_indices = np.argsort(similarities)[::-1][1:n+1]
    

    results = []
    for idx in similar_indices:
        results.append({
            'customer_id': customer_ids.iloc[idx],
            'similarity_score': similarities[idx]
        })
    
    return results

def generate_lookalike_recommendations():
    """Generate lookalike recommendations for customers C0001-C0020"""

    feature_df = load_and_prepare_data('Customers.csv', 'Products.csv')
    customer_ids, similarity_matrix = create_lookalike_model(feature_df)
    

    recommendations = []
    for customer_id in customer_ids[customer_ids.str.match('C00[0-1][0-9]')]:
        lookalikes = get_top_lookalikes(customer_id, customer_ids, similarity_matrix)
        row = {
            'CustomerID': customer_id,
            'Lookalike1_ID': lookalikes[0]['customer_id'],
            'Lookalike1_Score': round(lookalikes[0]['similarity_score'], 4),
            'Lookalike2_ID': lookalikes[1]['customer_id'],
            'Lookalike2_Score': round(lookalikes[1]['similarity_score'], 4),
            'Lookalike3_ID': lookalikes[2]['customer_id'],
            'Lookalike3_Score': round(lookalikes[2]['similarity_score'], 4)
        }
        recommendations.append(row)
    

    result_df = pd.DataFrame(recommendations)
    result_df.to_csv('FirstName_LastName_Lookalike.csv', index=False)
    return result_df
recommendations = generate_lookalike_recommendations()
print(recommendations)
recommendations.to_csv(r"C:\Users\rahul\Downloads\Data_Science\Rahul_Gurram_Lookalike.csv", index=False)

   CustomerID Lookalike1_ID  Lookalike1_Score Lookalike2_ID  Lookalike2_Score  \
0       C0001         C0096            0.9785         C0113            0.9169   
1       C0002         C0088            0.9924         C0078            0.9631   
2       C0003         C0012            0.9832         C0155            0.9729   
3       C0004         C0108            0.9026         C0099            0.8606   
4       C0005         C0128            0.9983         C0021            0.9479   
5       C0006         C0011            0.9597         C0095            0.9048   
6       C0007         C0162            0.9507         C0093            0.9484   
7       C0008         C0038            0.9744         C0030            0.9733   
8       C0009         C0069            0.9550         C0057            0.8876   
9       C0010         C0100            0.9741         C0069            0.9128   
10      C0011         C0188            0.9653         C0006            0.9597   
11      C0012         C0155 

In [5]:

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import spearmanr


def clean_customer_data(df):
    """Clean and prepare customer data"""
    df.columns = ['CustomerID', 'CustomerName', 'Region', 'SignupDate']
    df['Region'] = df['Region'].str.strip()
    df['SignupDate'] = pd.to_datetime(df['SignupDate'])
    return df

def clean_product_data(df):
    """Clean and prepare product data"""
    df.columns = ['ProductID', 'ProductName', 'Category', 'Price']
    df['Category'] = df['Category'].str.strip()
    return df

def load_and_prepare_data(customers_file, products_file):
    """Load and prepare the data for the lookalike model"""

    customers = pd.read_csv(r"C:\Users\rahul\Downloads\Data_Science\Customers.csv")
    products = pd.read_csv(r"C:\Users\rahul\Downloads\Data_Science\Products.csv")
    

    customers = clean_customer_data(customers)
    products = clean_product_data(products)
    

    region_dummies = pd.get_dummies(customers['Region'], prefix='Region')

    n_categories = len(products['Category'].unique())
    category_preferences = pd.DataFrame(
        np.random.dirichlet(np.ones(n_categories), size=len(customers)),
        columns=products['Category'].unique(),
        index=customers.index
    )
    

    feature_df = pd.concat([
        customers[['CustomerID']],
        region_dummies,
        category_preferences
    ], axis=1)
    
    return feature_df


def create_lookalike_model(feature_df):
    """Create and return the lookalike model"""
    customer_ids = feature_df['CustomerID']
    features = feature_df.drop('CustomerID', axis=1)
    
    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(features)
    
    similarity_matrix = cosine_similarity(scaled_features)
    
    return customer_ids, similarity_matrix


def get_top_lookalikes(customer_id, customer_ids, similarity_matrix, n=3):
    """Get top n lookalike customers for a given customer ID"""
    customer_idx = customer_ids[customer_ids == customer_id].index[0]
    similarities = similarity_matrix[customer_idx]
    
    similar_indices = np.argsort(similarities)[::-1][1:n+1]
    
    results = []
    for idx in similar_indices:
        results.append({
            'customer_id': customer_ids.iloc[idx],
            'similarity_score': similarities[idx]
        })
    
    return results

def generate_lookalike_recommendations():
    """Generate lookalike recommendations for customers C0001-C0020"""
    feature_df = load_and_prepare_data('Customers.csv', 'Products.csv')
    customer_ids, similarity_matrix = create_lookalike_model(feature_df)
    
    recommendations = []
    for customer_id in customer_ids[customer_ids.str.match('C00[0-1][0-9]')]:
        lookalikes = get_top_lookalikes(customer_id, customer_ids, similarity_matrix)
        row = {
            'CustomerID': customer_id,
            'Lookalike1_ID': lookalikes[0]['customer_id'],
            'Lookalike1_Score': round(lookalikes[0]['similarity_score'], 4),
            'Lookalike2_ID': lookalikes[1]['customer_id'],
            'Lookalike2_Score': round(lookalikes[1]['similarity_score'], 4),
            'Lookalike3_ID': lookalikes[2]['customer_id'],
            'Lookalike3_Score': round(lookalikes[2]['similarity_score'], 4)
        }
        recommendations.append(row)
    
    result_df = pd.DataFrame(recommendations)
    result_df.to_csv('FirstName_LastName_Lookalike.csv', index=False)
    return result_df


def analyze_region_cohesion(region_features, similarity_matrix):
    """Analyze if similar customers share regional characteristics"""
    region_cohesion = []
    
    for i in range(len(similarity_matrix)):
        similar_indices = np.argsort(similarity_matrix[i])[::-1][1:4]
        customer_region = region_features.iloc[i]
        similar_regions = region_features.iloc[similar_indices]
        region_matches = np.mean([np.array_equal(customer_region, similar_region) 
                                for similar_region in similar_regions.values])
        region_cohesion.append(region_matches)
    
    return np.mean(region_cohesion)

def analyze_category_correlations(category_preferences):
    """Analyze correlations between category preferences"""
    corr_matrix = category_preferences.corr()
    return {
        'mean_correlation': np.mean(np.abs(corr_matrix.values)),
        'max_correlation': np.max(np.abs(corr_matrix.values[np.triu_indices_from(corr_matrix, k=1)]))
    }

def evaluate_model_quality(feature_df, similarity_matrix, customer_ids):
    """Evaluate the quality of the lookalike model"""

    similarity_stats = {
        'mean_similarity': np.mean(similarity_matrix),
        'std_similarity': np.std(similarity_matrix),
        'min_similarity': np.min(similarity_matrix[similarity_matrix != 1]),
        'max_similarity': np.max(similarity_matrix[similarity_matrix != 1])
    }
    

    region_cols = [col for col in feature_df.columns if col.startswith('Region_')]
    region_validation = analyze_region_cohesion(feature_df[region_cols], similarity_matrix)
    

    category_cols = [col for col in feature_df.columns 
                    if not col.startswith('Region_') and col != 'CustomerID']
    category_corr = analyze_category_correlations(feature_df[category_cols])
    
    return {
        'similarity_stats': similarity_stats,
        'region_validation': region_validation,
        'category_correlation': category_corr
    }


if __name__ == "__main__":

    recommendations = generate_lookalike_recommendations()
    print("\nLookalike Recommendations:")
    print(recommendations)
    

    feature_df = load_and_prepare_data('Customers.csv', 'Products.csv')
    customer_ids, similarity_matrix = create_lookalike_model(feature_df)
    

    evaluation_results = evaluate_model_quality(feature_df, similarity_matrix, customer_ids)
    

    print("\nModel Evaluation Results:")
    print("\n1. Similarity Score Distribution:")
    for metric, value in evaluation_results['similarity_stats'].items():
        print(f"{metric}: {value:.4f}")
    
    print(f"\n2. Region Cohesion Score: {evaluation_results['region_validation']:.4f}")
    
    print("\n3. Category Preference Analysis:")
    for metric, value in evaluation_results['category_correlation'].items():
        print(f"{metric}: {value:.4f}")


Lookalike Recommendations:
   CustomerID Lookalike1_ID  Lookalike1_Score Lookalike2_ID  Lookalike2_Score  \
0       C0001         C0032            0.9899         C0102            0.9868   
1       C0002         C0200            0.9743         C0093            0.9607   
2       C0003         C0004            0.9713         C0095            0.9577   
3       C0004         C0130            0.9854         C0003            0.9713   
4       C0005         C0045            0.9860         C0159            0.9780   
5       C0006         C0104            0.9985         C0188            0.9823   
6       C0007         C0101            0.9744         C0173            0.9278   
7       C0008         C0068            0.9782         C0034            0.9197   
8       C0009         C0023            0.9940         C0172            0.9828   
9       C0010         C0121            0.9698         C0014            0.9154   
10      C0011         C0152            0.9422         C0113            0.9361   
