In [None]:
pip install faker

Collecting faker
  Downloading Faker-23.2.1-py3-none-any.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: faker
Successfully installed faker-23.2.1


In [None]:
import pandas as pd
import numpy as np

# Load names from the provided Excel file
file_path = '/content/Corrected_Indian_Names.xlsx'
names_df = pd.read_excel(file_path)

# Extracting lists of names and last names
male_first_names = names_df['Male Names'].dropna().tolist()
female_first_names = names_df['Female Names'].dropna().tolist()
last_names = names_df['Last Names'].dropna().tolist()

# Total number of synthetic athlete profiles to generate
num_athletes = 10000

# Helper functions for BMI calculation and training years
def calculate_weight_for_bmi(height_cm, gender, min_bmi=18, max_bmi=23.6):
    if gender == 'Female' and height_cm > 183:  # Adjust height for female athletes if it exceeds the limit
        height_cm = np.random.randint(155, 184)
    min_weight_kg = (min_bmi * (height_cm / 100) ** 2)
    max_weight_kg = (max_bmi * (height_cm / 100) ** 2)
    if gender == 'Female':
        return np.random.randint(max(40, np.ceil(min_weight_kg)), min(70, np.floor(max_weight_kg) + 1))
    return np.random.randint(np.ceil(min_weight_kg), np.floor(max_weight_kg) + 1)

def calculate_training_years(current_age):
    start_age = np.random.randint(16, min(21, current_age))  # Athletes start training between ages 16 and 20
    return current_age - start_age

# Education levels and their corresponding weights
education_levels = ['High School', 'Bachelor', 'Master', 'PhD']
education_weights = [0.20, 0.60, 0.12, 0.08]

# Generating synthetic athlete data
athlete_data = []
for _ in range(num_athletes):
    gender = 'Male' if np.random.rand() < 0.65 else 'Female'
    first_name = np.random.choice(male_first_names if gender == 'Male' else female_first_names)
    last_name = np.random.choice(last_names)
    name = f"{first_name} {last_name}"
    age = np.random.randint(18, 40)
    height = np.random.randint(155, 210) if gender == 'Male' else np.random.randint(155, 184)
    weight = calculate_weight_for_bmi(height, gender)
    education_level = np.random.choice(education_levels, p=education_weights)
    training_years = calculate_training_years(age)
    level = 'Amateur' if training_years < 3 else ('Semi-Pro' if training_years < 7 else 'Professional')
    sport = 'Basketball' if ((gender == 'Male' and height > 190) or (gender == 'Female' and height >= 180)) else np.random.choice(['Soccer', 'Tennis', 'Swimming', 'Athletics', 'Cricket', 'Football', 'Wrestling', 'F1-Racing'])

    athlete_data.append([
        name, gender, age, height, weight, education_level, training_years, level, sport,
        np.random.choice(['Regional', 'State-Level', 'National', 'International']),  # Achievements
        np.random.choice(['Ahmedabad', 'Bangalore', 'Chennai', 'Delhi', 'Hyderabad', 'Jaipur', 'Kolkata', 'Mumbai', 'Pune', 'Surat', 'Lucknow', 'Kanpur', 'Nagpur', 'Indore', 'Thane', 'Bhopal', 'Visakhapatnam', 'Patna', 'Vadodara', 'Ghaziabad', 'Ludhiana', 'Agra', 'Nashik', 'Faridabad', 'Meerut', 'Rajkot', 'Varanasi', 'Srinagar', 'Aurangabad', 'Dhanbad', 'Amritsar', 'Allahabad']),  # Location
        np.random.choice(['Equipment', 'Financial', 'Nutrition', 'Apparel', 'Coaching']),  # Desired Sponsorship Type
        np.random.randint(100, 100000),  # Social Media Followers
        np.random.choice(['100k-200k', '200k-300k', '300k-400k', '400k-500k', '600k-700k', 'Above 700k'])  # Annual Income
    ])

# Convert list of data into a DataFrame
columns = ['Name', 'Gender', 'Age', 'Height (cm)', 'Weight (kg)', 'Education Level', 'Training History Years', 'Level', 'Sport', 'Achievements', 'Location', 'Desired Sponsorship Type', 'Social Media Followers', 'Annual Income']
athlete_df = pd.DataFrame(athlete_data, columns=columns)

# Display the first few rows of the DataFrame to verify the output
print(athlete_df.head())

# Save the DataFrame to a CSV file
athlete_df.to_csv('enhanced_synthetic_athlete_data.csv', index=False)

# Provide the file path for download


               Name  Gender  Age  Height (cm)  Weight (kg) Education Level  \
0         Tanvi Sen  Female   38          183           69        Bachelor   
1  Vishal Mukherjee    Male   37          191           82        Bachelor   
2      Anmol Mishra    Male   39          188           78          Master   
3      Tanvi Khanna  Female   36          156           49        Bachelor   
4      Yogesh Reddy    Male   26          186           71        Bachelor   

   Training History Years         Level       Sport   Achievements  Location  \
0                      19  Professional  Basketball       Regional  Srinagar   
1                      19  Professional  Basketball    State-Level     Patna   
2                      21  Professional      Tennis       Regional   Dhanbad   
3                      19  Professional   Wrestling  International     Delhi   
4                       6      Semi-Pro      Tennis    State-Level  Amritsar   

  Desired Sponsorship Type  Social Media Followers

In [10]:
import pandas as pd
import numpy as np

# Assume these are the sports that sponsors might be interested in
sports_list = ['Soccer', 'Basketball', 'Tennis', 'Swimming', 'Athletics', 'Cricket', 'Football', 'Wrestling', 'F1-Racing']

# Total number of synthetic sponsor profiles to generate
num_sponsors = 1000

# Generating synthetic sponsor data
sponsor_data = []
for _ in range(num_sponsors):
    company_name = f"{np.random.choice(['Tech', 'Sports', 'Health', 'Food', 'Fashion', 'Finance'])} {np.random.choice(['Corporation', 'Group', 'LLC', 'Inc.', 'Ltd.'])}"
    industry = np.random.choice(['Technology', 'Sports', 'Healthcare', 'Food & Beverage', 'Fashion', 'Finance'])
    budget_range = np.random.choice(['1,000-5,000', '5,000-20,000', '20,000-100,000', 'Over 100,000'])
    location = np.random.choice(['Ahmedabad', 'Bangalore', 'Chennai', 'Delhi', 'Hyderabad', 'Jaipur', 'Kolkata', 'Mumbai', 'Pune', 'Surat', 'Lucknow', 'Kanpur', 'Nagpur', 'Indore', 'Thane', 'Bhopal', 'Visakhapatnam', 'Patna', 'Vadodara', 'Ghaziabad', 'Ludhiana', 'Agra', 'Nashik', 'Faridabad', 'Meerut', 'Rajkot', 'Varanasi', 'Srinagar', 'Aurangabad', 'Dhanbad', 'Amritsar', 'Allahabad'])
    target_audience = np.random.choice(['Teens', 'Young Adults', 'Adults', 'Seniors'])
    marketing_goals = np.random.choice(['Brand Awareness', 'Sales Increase', 'Market Expansion'])
    commitment_level_years = round(np.random.uniform(0.5, 5), 2)
    preferred_sport = np.random.choice(sports_list)

    # Append sponsor profile to the list
    sponsor_data.append([company_name, industry, budget_range, location, target_audience, marketing_goals, commitment_level_years, preferred_sport])

# Convert list of data into a DataFrame
columns = ['Company Name', 'Industry', 'Budget Range', 'Location', 'Target Audience', 'Marketing Goals', 'Commitment Level Years', 'Preferred Sport']
sponsor_df = pd.DataFrame(sponsor_data, columns=columns)

# Display the first few rows of the DataFrame to verify the output
print(sponsor_df.head())

# Save the DataFrame to a CSV file
output_file_sponsors = '/mnt/data/synthetic_sponsor_data.csv'
sponsor_df.to_csv('enhanced_synthetic_sponsor_data.csv', index=False)

# Provide the file path for download
output_file_sponsors
# Re-import necessary libraries and reload data due to reset
import pandas as pd

# Load the sponsor dataset again
sponsor_data_path = '/mnt/data/enhanced_synthetic_sponsor_data (2).csv'
sponsor_data = pd.read_csv(sponsor_data_path)

# Replace the company names with unique names
sponsor_data['Company Name'] = 'Company_' + sponsor_data.index.astype(str)

# Save the modified dataset
modified_sponsor_data_path = '/mnt/data/modified_enhanced_synthetic_sponsor_data.csv'
sponsor_data.to_csv(modified_sponsor_data_path, index=False)

# Return the path to the modified file
modified_sponsor_data_path


   Company Name         Industry    Budget Range       Location  \
0      Tech LLC  Food & Beverage    Over 100,000      Ahmedabad   
1    Tech Group          Fashion  20,000-100,000  Visakhapatnam   
2  Finance Inc.       Technology  20,000-100,000      Ghaziabad   
3     Food Inc.          Fashion     1,000-5,000       Ludhiana   
4  Health Group       Healthcare  20,000-100,000       Varanasi   

  Target Audience   Marketing Goals  Commitment Level Years Preferred Sport  
0         Seniors    Sales Increase                    4.85      Basketball  
1         Seniors  Market Expansion                    0.76          Tennis  
2           Teens    Sales Increase                    4.09       Wrestling  
3         Seniors   Brand Awareness                    4.09          Soccer  
4          Adults  Market Expansion                    1.00      Basketball  


'/mnt/data/synthetic_sponsor_data.csv'

In [33]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer

# Load the datasets
athlete_data_path = '/content/enhanced_synthetic_athlete_data (6).csv'
sponsor_data_path = '/content/manually_modified_enhanced_synthetic_sponsor_data.csv'
athlete_data = pd.read_csv(athlete_data_path)
sponsor_data = pd.read_csv(sponsor_data_path)

# Basic cleaning and preprocessing
athlete_data.drop_duplicates(inplace=True)
sponsor_data.drop_duplicates(inplace=True)
athlete_data.fillna(athlete_data.median(numeric_only=True), inplace=True)
sponsor_data.fillna(sponsor_data.median(numeric_only=True), inplace=True)

# Creating a subset of the athlete data for demonstration
athlete_subset = athlete_data.sample(n=1000, random_state=42)

# Feature Engineering
def engineer_features(athletes, sponsors):
    # Convert categorical features to numerical codes and normalize
    scaler = MinMaxScaler()
    categorical_columns = ['Location', 'Sport', 'Education Level', 'Level']
    for column in categorical_columns:
        if column in athletes.columns:
            athletes[column + ' Code'] = pd.Categorical(athletes[column]).codes
        if column in sponsors.columns:
            sponsors[column + ' Code'] = pd.Categorical(sponsors[column]).codes

    # Normalize features
    athlete_features = [col + ' Code' for col in categorical_columns if col + ' Code' in athletes.columns]
    sponsor_features = [col + ' Code' for col in categorical_columns if col + ' Code' in sponsors.columns]

    if athlete_features:
        athletes[athlete_features] = scaler.fit_transform(athletes[athlete_features])
    if sponsor_features:
        sponsors[sponsor_features] = scaler.fit_transform(sponsors[sponsor_features])

    return athletes, sponsors

athlete_subset, sponsor_data = engineer_features(athlete_subset, sponsor_data)

# Compatibility score calculation
def calculate_compatibility_scores(athletes, sponsors, vectorizer):
    compatibility_scores = []
    for _, athlete in athletes.iterrows():
        for _, sponsor in sponsors.iterrows():
            # Advanced compatibility calculations
            scores = {
                'location_score': 1 - abs(athlete.get('Location Code', 0) - sponsor.get('Location Code', 0)),
                'sport_score': 1 - abs(athlete.get('Sport Code', 0) - sponsor.get('Preferred Sport Code', 0)),
                'level_score': 1 - abs(athlete.get('Level Code', 0) - sponsor.get('Level Code', 0)),
                # Additional scores can be added here based on other features
            }
            # Calculate average of the scores
            total_score = sum(scores.values()) / len(scores)
            compatibility_scores.append({
                'Athlete': athlete['Name'],
                'Sponsor': sponsor['Company Name'],
                'Compatibility Score': total_score
            })
    return pd.DataFrame(compatibility_scores)

# Vectorize 'Desired Sponsorship Type' and 'Target Audience'
vectorizer = CountVectorizer()
athlete_target_audience = vectorizer.fit_transform(athlete_subset['Desired Sponsorship Type'].astype(str))
sponsor_target_audience = vectorizer.transform(sponsor_data['Target Audience'].astype(str))

# Add vectorized features back to the athlete and sponsor dataframes for compatibility calculation
for feature in vectorizer.get_feature_names_out():
    athlete_subset[feature] = athlete_target_audience.toarray()[:, vectorizer.vocabulary_.get(feature, 0)]
    sponsor_data[feature] = sponsor_target_audience.toarray()[:, vectorizer.vocabulary_.get(feature, 0)]

# Calculate and rank compatibility scores
compatibility_scores_df = calculate_compatibility_scores(athlete_subset, sponsor_data, vectorizer)
top_pairs = compatibility_scores_df.sort_values(by='Compatibility Score', ascending=False).head(10)

# Display the top 10 athlete-sponsor pairs based on compatibility scores
print(top_pairs)


                    Athlete            Sponsor  Compatibility Score
856658        Vivaan Bhatia  UniqueCompany_658                  1.0
856968        Vivaan Bhatia  UniqueCompany_968                  1.0
273443         Pranav Dalal  UniqueCompany_443                  1.0
615222            Shiv Saha  UniqueCompany_222                  1.0
615933            Shiv Saha  UniqueCompany_933                  1.0
606399          Mamta Mehta  UniqueCompany_399                  1.0
976240  Vedika Bandopadhyay  UniqueCompany_240                  1.0
438950         Abhinav Jain  UniqueCompany_950                  1.0
114326          Krish Menon  UniqueCompany_326                  1.0
976573  Vedika Bandopadhyay  UniqueCompany_573                  1.0
