In [1]:
import pandas as pd

In [2]:
# Load the songs dataset; the relative path is resolved against the current working directory.
df = pd.read_csv("gaanasongs.csv")

In [3]:
# Preview the first rows to sanity-check the columns and the value formats.
df.head()

Unnamed: 0,name,singer,singer_id,duration,link,language
0,Dil - E - Nadan Tujhe,Chitra Singh|Jagjit Singh,/artist/chitra-singh|/artist/jagjeet-singh-1,05:00,/dil-e-nadan-tujhe-3,Urdu
1,Agar Hum Kahen Aur Woh Muskara De,Chitra Singh|Jagjit Singh,/artist/chitra-singh|/artist/jagjeet-singh-1,06:26,/agar-hum-kahen-aur-woh-muskura-den,Urdu
2,Unke Dekhe Se,Jagjit Singh,/artist/jagjeet-singh-1,03:41,/unke-dekhe-se,Urdu
3,Yeh Na Thi Hamari Qismat - Chitra Singh,Chitra Singh,/artist/chitra-singh,04:26,/yeh-na-thi-hamari-qismat-2,Urdu
4,Hazaron Khwahishen Aisi,Jagjit Singh,/artist/jagjeet-singh-1,05:39,/hazaron-khwahishen-aisi-1,Urdu


In [4]:
# (rows, columns) of the loaded frame.
df.shape

(41355, 6)

In [5]:
# Per-column summary (count / unique / top / freq in the recorded output below).
df.describe()

Unnamed: 0,name,singer,singer_id,duration,link,language
count,41355,41355,41355,41355,41355,41355
unique,31534,6196,6580,1309,36361,16
top,O Mere Dil Ke Chain,Lata Mangeshkar,/artist/lata-mangeshkar,03:02,/kya-khoob-lagti-ho,Telugu
freq,26,1410,1410,447,2,4996


In [6]:
# Element-wise missing-value mask: True wherever a cell is null, same shape as df.
df.isnull()

Unnamed: 0,name,singer,singer_id,duration,link,language
0,False,False,False,False,False,False
1,False,False,False,False,False,False
2,False,False,False,False,False,False
3,False,False,False,False,False,False
4,False,False,False,False,False,False
...,...,...,...,...,...,...
41350,False,False,False,False,False,False
41351,False,False,False,False,False,False
41352,False,False,False,False,False,False
41353,False,False,False,False,False,False


In [7]:
# Number of missing values per column (compact summary of the null mask).
df.isnull().sum()

name         0
singer       0
singer_id    0
duration     0
link         0
language     0
dtype: int64

In [8]:
def _duration_to_seconds(value):
    """Convert one duration value to seconds, or NaN when it cannot be parsed.

    Accepts plain numbers (returned as float), numeric strings, and
    clock-style strings such as "MM:SS" or "HH:MM:SS".
    """
    if isinstance(value, (int, float)):
        return float(value)
    try:
        parts = [float(part) for part in str(value).strip().split(':')]
    except ValueError:
        return float('nan')
    # Fold left-to-right so each earlier component is worth 60x the next one.
    seconds = 0.0
    for part in parts:
        seconds = seconds * 60 + part
    return seconds


def preprocess_data(df):
    """Return a cleaned copy of the songs frame with ``duration`` in seconds.

    Steps, in order:
      1. Drop rows that repeat an earlier (name, singer) pair.
      2. Drop rows missing any of name, singer, duration, link or language.
      3. Convert ``duration`` to a float number of seconds.
      4. Drop rows whose duration could not be parsed.

    The previous implementation passed the raw "MM:SS" strings to
    ``pd.to_numeric(..., errors='coerce')``, which turned every such value
    into NaN and therefore discarded those rows in step 4.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns name, singer, duration, link and language.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is not modified and the surviving rows keep
        their original index labels.
    """
    required = ['name', 'singer', 'duration', 'link', 'language']
    deduped = df.drop_duplicates(subset=['name', 'singer'])
    complete = deduped.dropna(subset=required)
    seconds = complete['duration'].map(_duration_to_seconds).astype('float64')
    # assign() builds a new frame, avoiding chained-assignment warnings.
    return complete.assign(duration=seconds).dropna(subset=['duration'])

In [9]:
def eda(df, top_n=10):
    """Compute basic frequency summaries for the songs frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``singer`` and ``language``.
    top_n : int, default 10
        How many of the most frequent ``singer`` values to return.

    Returns
    -------
    tuple of (pandas.Series, pandas.Series)
        ``(top_singers, languages)``: occurrence counts of the ``top_n`` most
        frequent ``singer`` values, and occurrence counts of every
        ``language`` value. Both are sorted by descending count.

    Notes
    -----
    The ``singer`` column is counted verbatim, so a combined value such as
    "Chitra Singh|Jagjit Singh" is tallied as its own entry rather than being
    split into individual artists.
    """
    top_singers = df['singer'].value_counts().head(top_n)
    languages = df['language'].value_counts()
    return top_singers, languages

In [10]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [11]:
def collaborative_filtering(df, fav_singer):
    """Return up to 10 songs whose ``singer`` field contains ``fav_singer``.

    The match is a case-insensitive *literal* substring test. The name is
    deliberately not interpreted as a regular expression: artist names often
    contain characters such as ``.`` or ``(``, which would otherwise
    over-match or raise ``re.error``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``name``, ``singer`` and ``link``.
    fav_singer : str
        Text to look for inside the ``singer`` column.

    Returns
    -------
    pandas.DataFrame
        The first 10 matching rows, in their original order, restricted to
        the columns ``name``, ``singer`` and ``link``. Rows with a missing
        ``singer`` never match.
    """
    matches = df['singer'].str.contains(fav_singer, case=False, na=False, regex=False)
    return df.loc[matches, ['name', 'singer', 'link']].head(10)

In [12]:
def content_based_filtering(df, fav_language):
    """Return up to 10 songs whose language is most similar to ``fav_language``.

    The ``language`` column is vectorised with TF-IDF and the requested
    language is scored against every row by cosine similarity.

    Differences from the previous implementation:
      * Only a single query vector is compared with the rows, so memory use
        is linear in the number of songs instead of building the full
        n-by-n similarity matrix.
      * Rows are selected by position, so the function also works on frames
        whose index is not 0..n-1 (for example after rows were dropped).
      * A language that occurs in many rows no longer breaks the lookup. The
        old code called ``Series.drop_duplicates()``, which dedupes the
        values (row labels) rather than the language index, so the lookup
        returned many rows and the score sort failed.
      * Matching follows the vectoriser's tokenisation, which lower-cases
        text by default, so "urdu" finds "Urdu".

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``name``, ``singer``, ``link`` and
        ``language``.
    fav_language : str
        Language to recommend songs for.

    Returns
    -------
    pandas.DataFrame
        Columns ``name``, ``singer`` and ``link`` for at most 10 rows with a
        positive similarity score, highest score first (ties keep the frame's
        row order). Empty when nothing matches.
    """
    columns = ['name', 'singer', 'link']
    no_match = pd.DataFrame(columns=columns)
    if df.empty or not isinstance(fav_language, str) or not fav_language.strip():
        return no_match

    tfidf = TfidfVectorizer(stop_words='english')
    try:
        tfidf_matrix = tfidf.fit_transform(df['language'].fillna('').astype(str))
    except ValueError:
        # scikit-learn raises ValueError when no usable token is found
        # (empty vocabulary), e.g. if every language value is blank.
        return no_match

    # Score the one query against all rows: result has shape (1, n_rows).
    query_vector = tfidf.transform([fav_language])
    scores = pd.Series(cosine_similarity(query_vector, tfidf_matrix).ravel())

    # The default integer index of `scores` is the row position in df.
    best = scores[scores > 0].nlargest(10)
    if best.empty:
        return no_match
    return df.iloc[best.index.tolist()][columns]

In [13]:
import matplotlib.pyplot as plt

def show_language_distribution(df):
    """Draw a bar chart of how many songs exist per language.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``language`` column.

    Returns
    -------
    matplotlib.figure.Figure
        The newly created figure holding the bar chart.
    """
    songs_per_language = df['language'].value_counts()

    figure, axes = plt.subplots()
    songs_per_language.plot(kind='bar', ax=axes)

    # Label the chart so it can be read on its own.
    axes.set_title('Language Distribution')
    axes.set_xlabel('Language')
    axes.set_ylabel('Number of Songs')

    return figure
