In [1]:
import pandas as pd

In [2]:
df=pd.read_csv('internship_data.csv')

In [3]:
df.head()

Unnamed: 0,Company Name,Title,Location,Stipend
0,Xsoln,Marketing,Jaipur,"₹ 8,000 /month"
1,Careers360,Marketing,Work From Home,"₹ 15,000 lump sum + Incentives"
2,ERIC Robotics PSIPL,Robotics,Chinchwad,"₹ 8,000-10,000 /month"
3,Internshala,Visual Design,Wasuli,"₹ 18,000 /month"
4,Scaler Academy,Content Marketing,Pune,"₹ 15,000 /month"


In [64]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

#Creating text features
df['TextFeatures'] = df['Title'] + ' ' + df['Company Name'] + ' ' + df['Stipend'].astype(str) + ' ' + df['Location']


tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(df['TextFeatures'])

#Created instance
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

def get_recommendations(internship_title, top_n=5):

    idx = df.index[df['Title'] == internship_title].tolist()[0]


    sim_scores = list(enumerate(cosine_sim[idx]))


    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)


    top_recommendations = [
        {
            'Title': df['Title'].iloc[i],
            'Company Name': df['Company Name'].iloc[i],
            'Stipend': df['Stipend'].iloc[i],
            'Location': df['Location'].iloc[i]
        }
        for i, score in sim_scores[1:top_n+1]
    ]

    return top_recommendations

internship_title_to_recommend = 'Digital Marketing'
recommendations = get_recommendations(internship_title_to_recommend)

if recommendations:
    print(f"Top {len(recommendations)} recommended internships for {internship_title_to_recommend}:")
    for recommendation in recommendations:
        print(recommendation)

Top 5 recommended internships for Digital Marketing:
{'Title': 'Digital Marketing', 'Company Name': 'Lead Mines Media', 'Stipend': 10000.0, 'Location': 'Kolkata'}
{'Title': 'Social Media Marketing', 'Company Name': 'Lead Mines Media', 'Stipend': 10000.0, 'Location': 'Work From Home'}
{'Title': 'Social Media Marketing', 'Company Name': 'Lead Mines Media', 'Stipend': 10000.0, 'Location': 'Ghaziabad'}
{'Title': 'Telecalling', 'Company Name': 'Lead Mines Media', 'Stipend': 8000.0, 'Location': 'Ghaziabad'}
{'Title': 'Telemarketing - English', 'Company Name': 'Lead Mines Media', 'Stipend': 8000.0, 'Location': 'Work From Home'}


In [72]:
import pickle

model_filename = 'sample_data/recommendation_model1.pkl'
with open(model_filename, 'wb') as model_file:
    pickle.dump(cosine_sim, model_file)

print(f"Model saved as {model_filename}")

Model saved as sample_data/recommendation_model1.pkl


In [70]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import pandas as pd
from typing import List, Dict, Union

class InternshipRecommendationModel:
    def __init__(self):
        self.df = None
        self.tfidf_vectorizer = None
        self.cosine_sim = None

    def preprocess_data(self, df: pd.DataFrame):

        df['TextFeatures'] = df['Title'] + ' ' + df['Company Name'] + ' ' + df['Stipend'].astype(str) + ' ' + df['Location']
        return df

    def train_model(self, df: pd.DataFrame):
        df = self.preprocess_data(df)


        self.tfidf_vectorizer = TfidfVectorizer(stop_words='english')
        tfidf_matrix = self.tfidf_vectorizer.fit_transform(df['TextFeatures'])


        self.cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

        self.df = df

    def get_recommendations(self, internship_title: str, internship_location: str, top_n: int = 5) -> List[Dict[str, Union[str, float]]]:
        if self.df is None or self.tfidf_vectorizer is None or self.cosine_sim is None:
            raise ValueError("Model not trained. Use train_model() method first.")


        idx = self.df.index[self.df['Title'] == internship_title].tolist()[0]


        sim_scores = list(enumerate(self.cosine_sim[idx]))

        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)


        top_recommendations = [
            {
                'Title': self.df['Title'].iloc[i],
                'Company Name': self.df['Company Name'].iloc[i],
                'Stipend': self.df['Stipend'].iloc[i],
                'Location': self.df['Location'].iloc[i]
            }
            for i, score in sim_scores[1:top_n+1]
            if self.df['Location'].iloc[i] == internship_location
        ]

        return top_recommendations


model = InternshipRecommendationModel()


df=pd.read_csv('internship_data.csv')

model.train_model(df)


internship_title_to_recommend = 'Sales'
internship_location_to_recommend = 'Mumbai'
recommendations = model.get_recommendations(internship_title_to_recommend, internship_location_to_recommend)

if recommendations:
    print(f"Top {len(recommendations)} recommended internships for {internship_title_to_recommend} in {internship_location_to_recommend}:")
    for recommendation in recommendations:
        print(recommendation)
else:
  print("There is no such match.Please try any other matches")

There is no such match.Please try any other matches


In [73]:
import pickle

model_filename = 'recommendation_model2.pkl'
with open(model_filename, 'wb') as model_file:
    pickle.dump(cosine_sim, model_file)

print(f"Model saved as {model_filename}")

Model saved as recommendation_model2.pkl
