In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
input_files = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/spotify-indian-languages-datasets/Old_songs.csv
/kaggle/input/spotify-indian-languages-datasets/Gujarati_songs.csv
/kaggle/input/spotify-indian-languages-datasets/Telugu_songs.csv
/kaggle/input/spotify-indian-languages-datasets/Malayalam_songs.csv
/kaggle/input/spotify-indian-languages-datasets/Kannada_songs.csv
/kaggle/input/spotify-indian-languages-datasets/Bhojpuri_songs.csv
/kaggle/input/spotify-indian-languages-datasets/Rajasthani_songs.csv
/kaggle/input/spotify-indian-languages-datasets/Bengali_songs.csv
/kaggle/input/spotify-indian-languages-datasets/Urdu_songs.csv
/kaggle/input/spotify-indian-languages-datasets/Haryanvi_songs.csv
/kaggle/input/spotify-indian-languages-datasets/Odia_songs.csv
/kaggle/input/spotify-indian-languages-datasets/Punjabi_songs.csv
/kaggle/input/spotify-indian-languages-datasets/Hindi_songs.csv
/kaggle/input/spotify-indian-languages-datasets/Assamese_songs.csv
/kaggle/input/spotify-indian-languages-datasets/Tamil_songs.csv
/kaggle/input/sp

In [2]:
import os
import pandas as pd
import numpy as np

In [3]:
DATA_DIR = "/kaggle/input/spotify-indian-languages-datasets"

# Read every CSV in DATA_DIR and tag each row with the file it came from.
dfs = []

# os.listdir() returns entries in arbitrary order; sorting keeps the row order
# and index labels of the concatenated frame reproducible across runs.
for file in sorted(os.listdir(DATA_DIR)):
    if file.endswith(".csv"):
        df = pd.read_csv(os.path.join(DATA_DIR, file))
        # e.g. "Hindi_songs.csv" -> "Hindi"
        df["source_language_file"] = file.replace("_songs.csv", "")
        dfs.append(df)

# pd.concat raises an opaque ValueError on an empty list; fail with a message
# that names the directory instead (same exception type).
if not dfs:
    raise ValueError(f"No .csv files found in {DATA_DIR!r}")

raw_df = pd.concat(dfs, ignore_index=True)

print("Rows before cleaning:", raw_df.shape[0])

Rows before cleaning: 35001


In [4]:
# Normalise headers to snake_case: trim whitespace, lower-case, then turn
# spaces into underscores.
trimmed_names = raw_df.columns.str.strip()
lowered_names = trimmed_names.str.lower()
raw_df.columns = lowered_names.str.replace(" ", "_")

In [5]:
# The singer fields are "|"-separated lists; keep the first entry of each as
# the primary singer name / id.
for source_col, target_col in (
    ("singer", "primary_singer"),
    ("singer_id", "primary_singer_id"),
):
    raw_df[target_col] = raw_df[source_col].str.split("|").str.get(0)

In [6]:
# Rows lacking any of these fields are discarded.
required_columns = ["song_name", "primary_singer_id", "released_date"]
raw_df = raw_df.dropna(subset=required_columns)

In [7]:
# Parse DD-MM-YYYY release dates; values that do not match become NaT and the
# corresponding rows are dropped. Year and month are then derived from the
# parsed date.
raw_df = (
    raw_df
    .assign(
        released_date=lambda frame: pd.to_datetime(
            frame["released_date"], format="%d-%m-%Y", errors="coerce"
        )
    )
    .dropna(subset=["released_date"])
    .assign(
        year=lambda frame: frame["released_date"].dt.year,
        month=lambda frame: frame["released_date"].dt.month,
    )
)

In [8]:
def duration_to_seconds(x):
    """Convert a ``"minutes:seconds"`` duration string to total seconds.

    Parameters
    ----------
    x : str
        Duration written as two integer fields separated by a single colon,
        e.g. ``"3:45"``.

    Returns
    -------
    int or float
        ``minutes * 60 + seconds`` as an int, or ``np.nan`` when ``x`` cannot
        be parsed (no usable ``.split``, wrong number of fields, or
        non-integer text).
    """
    try:
        m, s = x.split(":")
        return int(m) * 60 + int(s)
    except (AttributeError, TypeError, ValueError):
        # AttributeError: input without .split (None, float NaN, ...).
        # TypeError: .split rejects a str separator (e.g. bytes input).
        # ValueError: not exactly two fields, or a field is not an integer.
        # Anything else (KeyboardInterrupt, real bugs) is left to propagate
        # rather than being hidden by a bare ``except``.
        return np.nan

# Element-wise conversion of the duration column to seconds.
raw_df["duration_sec"] = raw_df["duration"].map(duration_to_seconds)

In [9]:
# Fill missing durations with the median of the song's language group, then
# fall back to the overall median for anything still missing.
#
# The column is patched with fillna(<group median>) rather than reassigned
# from groupby(...).transform(lambda ...): groupby drops rows whose key is NaN
# and transform yields NaN for them, which would overwrite durations that were
# actually present on rows with a missing language.
language_median = raw_df.groupby("language")["duration_sec"].transform("median")
raw_df["duration_sec"] = raw_df["duration_sec"].fillna(language_median)

raw_df["duration_sec"] = raw_df["duration_sec"].fillna(
    raw_df["duration_sec"].median()
)

In [10]:
# Label songs with no recorded language explicitly.
raw_df = raw_df.fillna({"language": "Unknown"})

In [11]:
# Audio feature columns to impute; only those actually present in the frame
# are kept.
candidate_features = (
    "danceability",
    "energy",
    "acousticness",
    "liveness",
    "speechiness",
    "valence",
    "tempo",
)

audio_features = [name for name in candidate_features if name in raw_df.columns]

In [12]:
# Record, per audio feature, which rows were originally missing (1) or
# present (0).
missing_flags = {
    f"{col}_missing_flag": raw_df[col].isna().astype(int)
    for col in audio_features
}
raw_df = raw_df.assign(**missing_flags)

In [13]:
# Impute each audio feature in three passes, from most to least specific:
# the primary singer's median, then the language's median, then the overall
# median.
#
# fillna(<group median>) only touches missing entries, so observed values are
# never rewritten (a reassignment from transform() would NaN-out rows whose
# group key is null), and the built-in "median" aggregation avoids calling a
# Python lambda once per group.
for col in audio_features:
    for group_key in ("primary_singer_id", "language"):
        group_median = raw_df.groupby(group_key)[col].transform("median")
        raw_df[col] = raw_df[col].fillna(group_median)

    raw_df[col] = raw_df[col].fillna(raw_df[col].median())

In [14]:
# Give songs without a singer name a placeholder label.
raw_df = raw_df.fillna({"primary_singer": "Unknown Artist"})

In [15]:
# Report the frame's shape and every column that still contains nulls.
null_counts = raw_df.isna().sum()

print("Final dataset shape:", raw_df.shape)
print("\nRemaining missing values:")
print(null_counts[null_counts > 0])

Final dataset shape: (35000, 32)

Remaining missing values:
mode              1
key               1
time_signature    1
popularity        1
stream            1
dtype: int64


In [16]:
# Drop the rows that still have a null in any of these columns.
still_incomplete = ["popularity", "stream", "mode", "key", "time_signature"]
raw_df = raw_df.dropna(axis="index", how="any", subset=still_incomplete)

In [17]:
# Largest per-column null count; 0 means no missing values remain.
raw_df.isnull().sum().max()

0

In [18]:
# Persist the cleaned frame (without the index) to the working directory.
raw_df.to_csv(path_or_buf="spotify_indian_languages_cleaned.csv", index=False)

In [19]:
# Summarise the column dtypes and non-null counts of the cleaned frame.
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 34999 entries, 0 to 35000
Data columns (total 32 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   song_name                  34999 non-null  object        
 1   singer                     34999 non-null  object        
 2   singer_id                  34999 non-null  object        
 3   duration                   34999 non-null  object        
 4   language                   34999 non-null  object        
 5   released_date              34999 non-null  datetime64[ns]
 6   danceability               34999 non-null  float64       
 7   acousticness               34999 non-null  float64       
 8   energy                     34999 non-null  float64       
 9   liveness                   34999 non-null  float64       
 10  loudness                   34999 non-null  float64       
 11  speechiness                34999 non-null  float64       
 12  tempo    

In [20]:
# Calendar year of release. NOTE: this holds the same values as the `year`
# column derived when the release dates were parsed.
raw_df = raw_df.assign(release_year=raw_df["released_date"].dt.year)

In [21]:
# Per-artist aggregates broadcast back to every song row: number of songs
# (non-null song names) and mean popularity of the primary singer.
by_artist = raw_df.groupby("primary_singer_id")
raw_df = raw_df.assign(
    artist_song_count=by_artist["song_name"].transform("count"),
    artist_avg_popularity=by_artist["popularity"].transform("mean"),
)

In [22]:
# Mean popularity of each language, broadcast to every song in that language.
popularity_by_language = raw_df.groupby("language")["popularity"]
raw_df["language_avg_popularity"] = popularity_by_language.transform("mean")

In [23]:
# Interaction feature: element-wise product of energy and danceability.
raw_df["energy_danceability"] = raw_df["energy"].mul(raw_df["danceability"])

In [24]:
# Binary label: 1 when popularity is at least 70, otherwise 0.
raw_df["hit_flag"] = raw_df["popularity"].ge(70).astype(int)

In [25]:
# Persist the feature-engineered frame (without the index).
raw_df.to_csv(path_or_buf="songs_india_cleaned_fe.csv", index=False)