# In [None]:  (Jupyter cell prompt left over from a notebook export)
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity
from flask import Flask, request, render_template

app = Flask(__name__)

# Load the raw investor and startup tables from CSV files in the working directory.
investor_ds = pd.read_csv('Investor_data.csv')
startup_ds = pd.read_csv('P18_Startup_Data.csv')

# Handle Missing Data
# Assign the filled column back rather than calling fillna(inplace=True) on a
# column selection: the in-place form operates on a temporary object, which
# pandas 2.x warns about and which does not update the frame under Copy-on-Write.
investor_ds['Preferred Industry'] = investor_ds['Preferred Industry'].fillna('Not Specified')
investor_ds['Preferred Stage'] = investor_ds['Preferred Stage'].fillna('Not Specified')
# Cells that are exactly the lower-case string 'not specified' (any column)
# become NaN. This can reintroduce NaN into the two columns filled above, so
# the code below must tolerate non-string values.
investor_ds.replace('not specified', np.nan, inplace=True)

# Drop duplicates
investor_ds.drop_duplicates(subset='Investor ID', keep='first', inplace=True)

# Industry labels that get one 0/1 indicator column each.
industries = ['Aerospace', 'Healthcare and Biotechnology', 'Life Sciences', 'Food and Beverage', 'Financial Technology',
              'Legal Technology', 'Real Estate Technology', 'Education Technology', 'Transportation and Mobility',
              'Agriculture Technology', 'Travel and Hospitality Technology', 'Artificial Intelligence and Machine Learning',
              'Gaming and Esports', 'Supply Chain and Logistics Technology', 'Social Media and Networking', 'E-Commerce and Online Retail']

for industry in industries:
    # Split on ',' and strip each piece so both "A, B" and "A,B" match; a
    # non-string cell (NaN) simply yields 0 instead of raising AttributeError.
    # `industry=industry` binds the current label to the lambda explicitly.
    investor_ds[industry] = investor_ds['Preferred Industry'].apply(
        lambda x, industry=industry: int(
            isinstance(x, str) and industry in [part.strip() for part in x.split(',')]
        )
    )

def convert_to_numeric(amount_str):
    """Convert an investment-amount cell to a single float.

    Args:
        amount_str: Either a string such as ``"1000"`` or a range written as
            ``"1000 to 5000"``, or a value that is already numeric.

    Returns:
        The parsed number as a float; for a ``"<low> to <high>"`` range, the
        midpoint of the two bounds. Returns ``None`` for blank strings and for
        values that are neither strings nor numbers (e.g. ``None``).

    Raises:
        ValueError: If a non-blank string cannot be parsed as a number or as a
            two-sided ``to`` range.
    """
    if isinstance(amount_str, str):
        amount_str = amount_str.strip()
        if not amount_str:
            # Blank cell: treat as missing rather than failing in float('').
            return None
        if 'to' in amount_str:
            start, end = amount_str.split('to')
            start_value = float(start.strip())
            end_value = float(end.strip())
            return (start_value + end_value) / 2
        return float(amount_str)
    # Values already numeric must pass through; previously they fell off the
    # end of the function and were silently turned into None.
    if isinstance(amount_str, (int, float, np.number)) and not isinstance(amount_str, bool):
        return float(amount_str)
    return None

# Turn the investment-amount column into floats (ranges become their midpoint).
investor_ds['Preffered Investment Amount'] = investor_ds['Preffered Investment Amount'].apply(convert_to_numeric)

# Distinct investor stage labels: split comma-separated cells, trim whitespace,
# drop empty tokens and de-duplicate while keeping first-seen order.
stages = investor_ds['Preferred Stage'].str.split(',', expand=True).stack().unique()
stages = list(dict.fromkeys(stage.strip() for stage in stages if pd.notna(stage) and stage.strip()))

for stage in stages:
    # regex=False: stage labels are literal text, so characters such as '+' or
    # '(' must not be interpreted as regular-expression syntax.
    investor_ds[stage] = investor_ds['Preferred Stage'].str.contains(
        stage, case=False, na=False, regex=False
    ).astype(int)

investor_ds['All industries'] = investor_ds['Preferred Industry'].str.contains('All industries', case=False, na=False).astype(int)

# startup_ds was already read from 'P18_Startup_Data.csv' earlier in this
# module and has not been modified since, so it is not loaded a second time.

# Assign filled columns back; fillna(inplace=True) on a column selection does
# not reliably update the frame (no-op under pandas Copy-on-Write).
startup_ds['Industry'] = startup_ds['Industry'].fillna('Not Specified')
startup_ds['Description'] = startup_ds['Description'].fillna('Not Specified')

startup_ds.drop_duplicates(subset='Startup ID', keep='first', inplace=True)

for industry in industries:
    # Strip each comma-separated piece; without the strip, every industry after
    # the first in "A, B" carries a leading space and never matches.
    startup_ds[industry] = startup_ds['Industry'].apply(
        lambda x, industry=industry: int(
            isinstance(x, str) and industry in [part.strip() for part in x.split(',')]
        )
    )

startup_ds['Amount Seeking'] = pd.to_numeric(startup_ds['Amount Seeking'], errors='coerce')

# From here on `stages` holds the startup stage labels (it is rebound).
stages = startup_ds['Stage'].str.split(',', expand=True).stack().unique()
stages = list(dict.fromkeys(stage.strip() for stage in stages if pd.notna(stage) and stage.strip()))

for stage in stages:
    startup_ds[stage] = startup_ds['Stage'].str.contains(
        stage, case=False, na=False, regex=False
    ).astype(int)

# Drop identifier/free-text columns and handle categorical data
investor_features = investor_ds.drop(columns=['Investor ID', 'Name', 'Email'])
startup_features = startup_ds.drop(columns=['Startup ID', 'Name', 'Description'])

# Object-typed columns present in BOTH tables; sorted so the encoded column
# order is the same on every run (set iteration order is not).
common_categorical_columns = sorted(
    set(investor_features.select_dtypes(include=['object']).columns)
    & set(startup_features.select_dtypes(include=['object']).columns)
)

# The encoder is fitted on investor rows only and then applied to startups.
encoder = OneHotEncoder(drop='first', sparse_output=False)
investor_encoded = encoder.fit_transform(investor_features[common_categorical_columns])
startup_encoded = encoder.transform(startup_features[common_categorical_columns])

investor_encoded_df = pd.DataFrame(investor_encoded, columns=encoder.get_feature_names_out(common_categorical_columns))
startup_encoded_df = pd.DataFrame(startup_encoded, columns=encoder.get_feature_names_out(common_categorical_columns))

investor_numeric = investor_features.select_dtypes(include=[np.number])
startup_numeric = startup_features.select_dtypes(include=[np.number])

# Only numeric columns that exist in both tables can be compared.
common_numeric_columns = investor_numeric.columns.intersection(startup_numeric.columns)
investor_numeric = investor_numeric[common_numeric_columns]
startup_numeric = startup_numeric[common_numeric_columns]

# The scaler is likewise fitted on investors and reused for startups.
scaler = StandardScaler()
investor_numeric_scaled = scaler.fit_transform(investor_numeric)
startup_numeric_scaled = scaler.transform(startup_numeric)

# Final feature matrices: scaled numeric columns followed by one-hot columns.
investor_combined = np.hstack([investor_numeric_scaled, investor_encoded])
startup_combined = np.hstack([startup_numeric_scaled, startup_encoded])

@app.route('/')
def form():
    """Render and return the 'startup_form.html' template for the root URL."""
    page = render_template('startup_form.html')  # HTML for UI
    return page

def _parse_amount_seeking(amount_seeking):
    """Convert the form's 'Amount Seeking' choice to a single number.

    ``'5000000+'`` maps to 5000001; a ``'<low>-<high>'`` range maps to the
    midpoint of its first two bounds. Raises ValueError for any other shape.
    """
    if amount_seeking == '5000000+':
        return 5000001
    amount_range = amount_seeking.split('-')
    if len(amount_range) < 2:
        # Without this check a value such as "1000" raised IndexError and was
        # reported as an internal error instead of bad input.
        raise ValueError(
            f"Amount Seeking must be '<low>-<high>' or '5000000+', got {amount_seeking!r}"
        )
    return (float(amount_range[0]) + float(amount_range[1])) / 2


@app.route('/recommend', methods=['POST'])
def recommend():
    """Rank investors against a startup submitted through the form.

    Reads the 'Industry', 'Stage' and 'Amount Seeking' form fields, builds a
    one-row feature frame with the same columns as ``startup_features``,
    applies the module-level ``encoder`` and ``scaler``, and computes cosine
    similarity against ``investor_combined``.

    Returns:
        An HTML table of up to 15 investors ordered from most to least
        similar, each with its own similarity score. On invalid input returns
        a message with status 400; on any other failure, status 500.
    """
    # Local import so the block brings its own dependency into scope.
    from html import escape

    try:
        required_fields = ('Industry', 'Stage', 'Amount Seeking')
        missing = [field for field in required_fields if field not in request.form]
        if missing:
            # Report absent fields as bad input (400) rather than letting the
            # lookup failure fall through to the generic 500 handler.
            raise ValueError(f"Missing form field(s): {', '.join(missing)}")

        industry = request.form['Industry']
        stage = request.form['Stage']
        amount_value = _parse_amount_seeking(request.form['Amount Seeking'])

        # Create a new startup entry
        new_startup = pd.DataFrame({
            'Industry': [industry],
            'Stage': [stage],
            'Amount Seeking': [amount_value]
        })

        # Add industry columns (tolerate "A,B" as well as "A, B")
        selected_industries = {part.strip() for part in industry.split(',')}
        for ind in industries:
            new_startup[ind] = 1 if ind in selected_industries else 0

        # Add stage columns; case-insensitive substring match, mirroring the
        # str.contains(case=False) matching used when the dataset was prepared.
        stage_lower = stage.lower()
        for stg in stages:
            new_startup[stg] = 1 if stg.lower() in stage_lower else 0

        # Add 'All industries' column (kept only if startup_features has it;
        # the reorder below selects startup_features' columns).
        new_startup['All industries'] = 1 if 'All industries' in industry else 0

        # Ensure all columns from startup_features are present
        for col in startup_features.columns:
            if col not in new_startup.columns:
                new_startup[col] = 0

        # Reorder columns to match startup_features
        new_startup = new_startup[startup_features.columns]

        # Encode categorical data and scale numeric data with the fitted transformers
        new_startup_encoded = encoder.transform(new_startup[common_categorical_columns])
        new_startup_numeric_scaled = scaler.transform(new_startup[common_numeric_columns])

        # Combine in the same order used for investor_combined
        new_startup_combined = np.hstack([new_startup_numeric_scaled, new_startup_encoded])

        # Calculate similarity with investors (one row, one column per investor)
        similarity_matrix = cosine_similarity(new_startup_combined, investor_combined)

        # Rank by row position: row i of investor_combined was built from row i
        # of investor_ds, so positions identify investors unambiguously.
        scores = pd.Series(similarity_matrix[0])
        recommendations = scores.nlargest(15)

        # Select the investors in ranked order so each row receives ITS OWN
        # score. (Filtering with isin() returned dataset order while the scores
        # were in descending order, pairing scores with the wrong investors.)
        display_columns = ['Name', 'Email', 'Preferred Industry', 'Preferred Stage', 'Preffered Investment Amount']
        top_investors = investor_ds.iloc[recommendations.index][display_columns].copy()
        top_investors['Similarity Score'] = recommendations.to_numpy()

        return top_investors.to_html(index=False)

    except ValueError as e:
        # The message may echo user-supplied text; escape it because the
        # response body is served as HTML.
        return f"Error processing input: {escape(str(e))}", 400
    except Exception as e:
        app.logger.exception("Unexpected error while computing recommendations")
        return f"An unexpected error occurred: {escape(str(e))}", 500

if __name__ == '__main__':
    # Script entry point: start the Flask server on port 5009 with debug mode enabled.
    app.run(port=5009, debug=True)
