In [21]:
import pandas as pd
from tqdm import tqdm

In [22]:
df = pd.read_csv('songs.csv')

In [23]:
df.head()

Unnamed: 0,name,singer,singer_id,duration,link,language
0,Dil - E - Nadan Tujhe,Chitra Singh|Jagjit Singh,/artist/chitra-singh|/artist/jagjeet-singh-1,05:00,/dil-e-nadan-tujhe-3,Urdu
1,Agar Hum Kahen Aur Woh Muskara De,Chitra Singh|Jagjit Singh,/artist/chitra-singh|/artist/jagjeet-singh-1,06:26,/agar-hum-kahen-aur-woh-muskura-den,Urdu
2,Unke Dekhe Se,Jagjit Singh,/artist/jagjeet-singh-1,03:41,/unke-dekhe-se,Urdu
3,Yeh Na Thi Hamari Qismat - Chitra Singh,Chitra Singh,/artist/chitra-singh,04:26,/yeh-na-thi-hamari-qismat-2,Urdu
4,Hazaron Khwahishen Aisi,Jagjit Singh,/artist/jagjeet-singh-1,05:39,/hazaron-khwahishen-aisi-1,Urdu


In [25]:
len(df)

41355

## Checking Null values

In [4]:
df.isnull().sum()

name         0
singer       0
singer_id    0
duration     0
link         0
language     0
dtype: int64

## 2. Data Preprocess
### 2.1) Dealing with Duration

In [6]:
duration_lst = []

for duration in df['duration']:

  m = int(duration.split(':')[0])
  s = int(duration.split(':')[1])
  sec = (m*60) + s

  duration_lst.append(sec)

df['duration'] = duration_lst

df.head()

Unnamed: 0,name,singer,singer_id,duration,link,language
0,Dil - E - Nadan Tujhe,Chitra Singh|Jagjit Singh,/artist/chitra-singh|/artist/jagjeet-singh-1,300,/dil-e-nadan-tujhe-3,Urdu
1,Agar Hum Kahen Aur Woh Muskara De,Chitra Singh|Jagjit Singh,/artist/chitra-singh|/artist/jagjeet-singh-1,386,/agar-hum-kahen-aur-woh-muskura-den,Urdu
2,Unke Dekhe Se,Jagjit Singh,/artist/jagjeet-singh-1,221,/unke-dekhe-se,Urdu
3,Yeh Na Thi Hamari Qismat - Chitra Singh,Chitra Singh,/artist/chitra-singh,266,/yeh-na-thi-hamari-qismat-2,Urdu
4,Hazaron Khwahishen Aisi,Jagjit Singh,/artist/jagjeet-singh-1,339,/hazaron-khwahishen-aisi-1,Urdu


## 3. Data Analysis
### 3.1) Average Duration of a song thorughout the dataset

In [7]:
print("Average duration of a song is", int(df['duration'].sum() / len(df['duration'])), 'seconds')

Average duration of a song is 295 seconds


### 3.2) How many laguages of songs are there is the dataset?


In [8]:
len(set(df['language']))

16

### 3.3) What are all the languages in the dataset?

In [9]:
for i in set(df['language']):
  print(i)

Bengali
Urdu
Tamil
Old
Gujarati
Kannada
Bhojpuri
Hindi
Telugu
Assamese
Odia
Haryanvi
Marathi
Malayalam
Punjabi
Rajasthani


## 3.4) How many Punjabi songs are there?

In [10]:
c = 0

for i in df['language']:
  if (i == 'Punjabi'):
    c += 1

print('There are',c,'songs in Punjabi')

There are 3818 songs in Punjabi


In [11]:
len(df[df['language'] == 'Punjabi'])

3818

## 3.5) Number of songs in each Language?

In [13]:
for lang in df['language'].unique():
  print(lang, len(df[df['language'] == lang]))

Urdu 3116
Gujarati 2115
Tamil 4677
Kannada 3559
Rajasthani 541
Haryanvi 228
Punjabi 3818
Hindi 4993
Old 4993
Malayalam 479
Marathi 4699
Telugu 4996
Odia 940
Bhojpuri 519
Assamese 724
Bengali 958


## 3.6) Average Duration of song in each language

In [14]:
for lang in df['language'].unique():
  print(lang, int(df[df['language'] == lang]['duration'].mean()))

Urdu 407
Gujarati 271
Tamil 270
Kannada 250
Rajasthani 291
Haryanvi 320
Punjabi 324
Hindi 307
Old 307
Malayalam 242
Marathi 281
Telugu 279
Odia 257
Bhojpuri 357
Assamese 228
Bengali 255


### 3.7) How many unique singers are there in the dataset?

In [15]:
all_singer = []
for singers in df['singer']:
  all_singer += singers.split('|')

print('Total Singers : ', len(set(all_singer)))

Total Singers :  3940


### 3.8) Singer who recorded most no of songs ?

In [17]:
data = []

unique_singers = list(set(all_singer))

for i in tqdm(unique_singers):

  c = 0
  for singers_ in df['singer']:
    if (i in singers_):
      c += 1

  data.append([i,c])

100%|██████████████████████████████████████████████████████████████████████████████| 3940/3940 [00:41<00:00, 95.32it/s]


In [19]:
songs = pd.DataFrame(data, columns = ['singer','songs'])

songs.head()

songs.sort_values(by = 'songs', ascending = False).head(20)

Unnamed: 0,singer,songs
2027,S. P. Balasubrahmanyam,3659
330,Kumar,3270
582,Susheel,3216
1608,Susheela,3215
2003,P. Susheela,3213
314,Lata Mangeshkar,3056
731,Asha Bhosle,2715
204,Kishore Kumar,2401
920,Janaki,2218
569,S. Janaki,2204


### 3.9) Singer who played most number of songs in each language?

In [20]:
for lang in df['language'].unique():

  df_ = df[df['language'] == lang]
  
  all_singer = []
  for singers in df_['singer']:
    all_singer += singers.split('|')

  print(lang, len(df_),len(set(all_singer)))

Urdu 3116 385
Gujarati 2115 303
Tamil 4677 510
Kannada 3559 379
Rajasthani 541 218
Haryanvi 228 76
Punjabi 3818 636
Hindi 4993 515
Old 4993 515
Malayalam 479 72
Marathi 4699 569
Telugu 4996 369
Odia 940 243
Bhojpuri 519 109
Assamese 724 228
Bengali 958 192
