In [16]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [17]:
# Read the startups catalogue from the /kaggle/input directory into a DataFrame
df = pd.read_csv(filepath_or_buffer='/kaggle/input/startups/startups.csv')
# Show the first five rows as the cell output
df.head(n=5)

Unnamed: 0,id,social_impact,environmental_impact,stage,risk_tolerance,investment_duration
0,1,Education,Renewable Energy,Idea Phase,2,3
1,2,Healthcare,Sustainable Agriculture,Prototype/MVP,1,1
2,3,Poverty Alleviation,Clean Technology,Early Stage,3,2
3,4,Gender Equality,Biodiversity Conservation,Growth Stage,2,3
4,5,Social Justice and Equality,Climate Change Mitigation and Adaptation,Mature Stage,1,1


In [18]:
# Build one text field per startup by joining its two impact labels with a space
df['impact'] = df['social_impact'].str.cat(df['environmental_impact'], sep=' ')

# Learn the TF-IDF vocabulary from the combined text and vectorize every startup
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['impact'])

# Pairwise cosine similarity of every startup against every other startup
# (the second argument is omitted, so the matrix is compared with itself)
cosine_sim = cosine_similarity(tfidf_matrix)

In [72]:
def get_recommendations(investor_prefs, top_n=5, startups=None):
    """Rank startups against an investor's preferences with a rule-based score.

    Each startup starts at 0 and receives:
      * +30 when both ``social_impact`` and ``environmental_impact`` equal the
        investor's choices, or +15 when exactly one of them does;
      * -5 for every unit of absolute difference in ``risk_tolerance``;
      * -5 for every unit of absolute difference in ``investment_duration``.

    Startups scoring below 10 are dropped. The remainder is ordered by score,
    highest first; ties are ordered by row position, highest first.

    Parameters
    ----------
    investor_prefs : dict
        Must contain the keys ``'social_impact'``, ``'environmental_impact'``,
        ``'risk_tolerance'`` and ``'investment_duration'``.
    top_n : int, default 5
        Maximum number of recommendations to return.
    startups : pandas.DataFrame, optional
        Catalogue to score. Must provide the four columns named above.
        Defaults to the notebook-level ``df``.

    Returns
    -------
    list of tuple
        ``(row_position, score)`` pairs, where ``row_position`` is the
        zero-based position of the startup in the catalogue (usable with
        ``.iloc``).

    Raises
    ------
    KeyError
        If a required preference key or catalogue column is missing.
    """
    if startups is None:
        # Fall back to the catalogue loaded earlier in the notebook.
        startups = df

    both_match_bonus = 30
    single_match_bonus = 15
    mismatch_penalty = 5
    min_score = 10

    wanted_social = investor_prefs["social_impact"]
    wanted_environmental = investor_prefs["environmental_impact"]
    wanted_risk = investor_prefs["risk_tolerance"]
    wanted_duration = investor_prefs["investment_duration"]

    # Read fields by column name rather than by position so that adding or
    # reordering columns in the catalogue cannot silently change the scoring.
    rows = zip(
        startups["social_impact"],
        startups["environmental_impact"],
        startups["risk_tolerance"],
        startups["investment_duration"],
    )

    scored = []
    for position, (social, environmental, risk, duration) in enumerate(rows):
        social_match = social == wanted_social
        environmental_match = environmental == wanted_environmental

        if social_match and environmental_match:
            score = both_match_bonus
        elif social_match or environmental_match:
            score = single_match_bonus
        else:
            score = 0

        score -= mismatch_penalty * abs(risk - wanted_risk)
        score -= mismatch_penalty * abs(duration - wanted_duration)

        if score >= min_score:
            scored.append((position, score))

    # Highest score first; among equal scores the later row comes first.
    scored.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)

    # Honour the caller's limit instead of a hard-coded five.
    return scored[:top_n]

In [80]:
# Example investor profile used to exercise the recommender
investor_prefs = dict(
    social_impact='Education',
    environmental_impact='Renewable Energy',
    risk_tolerance=3,
    investment_duration=2,
)

# Score the catalogue for this investor and report the ranked (row, score) pairs
recommendations = get_recommendations(investor_prefs)
print("Investor's Startup Recommendations:")
print(f"For investor's area of interest in {investor_prefs['social_impact']} and risk tolerance {investor_prefs['risk_tolerance']}, recommended startups are: {recommendations}")

# Print the full catalogue row behind each recommendation, best first
for row_position, _score in recommendations:
    matched_row = df.iloc[[row_position]]
    print(matched_row)

[(17, 25), (10, 25), (0, 20), (46, 15), (40, 15)]
Investor's Startup Recommendations:
For investor's area of interest in Education and risk tolerance 3, recommended startups are: [(17, 25), (10, 25), (0, 20), (46, 15), (40, 15)]
    id social_impact environmental_impact        stage  risk_tolerance  \
17  18     Education     Renewable Energy  Early Stage               2   

    investment_duration                      impact  
17                    2  Education Renewable Energy  
    id social_impact environmental_impact       stage  risk_tolerance  \
10  11     Education     Renewable Energy  Idea Phase               3   

    investment_duration                      impact  
10                    1  Education Renewable Energy  
   id social_impact environmental_impact       stage  risk_tolerance  \
0   1     Education     Renewable Energy  Idea Phase               2   

   investment_duration                      impact  
0                    3  Education Renewable Energy  
    id s