In [1]:
import numpy as np
import pandas as pd
import re

In [2]:
# Load the merged film table; every later cell rebinds this same `df` name.
df = pd.read_csv(filepath_or_buffer='data/merged_data.csv')

In [3]:
# Peek at the first three rows to check which columns were read.
df.head(n=3)

Unnamed: 0.1,Unnamed: 0,Year,Film,Cast,Director,Genre
0,0,1964,Aama (mother),"Shiva Shankar, Bhuvan Chand",Heera Singh Khatri,
1,1,1966,Maitighar (Birthplace[women's]),"Mala Sinha, C.P. Lohani",B.S. Thapa,
2,2,1967,"Hijo, Aaja, Bholi (Yesterday, Today, Tomarrow)","Mitra Lal Sharma, Bhuvan Chand",Heera Singh Khatri,


In [4]:
# Remove the 'Unnamed: 0' column (presumably a row index saved by an earlier to_csv — confirm).
# NOTE: re-running this cell raises KeyError because the column is already gone.
df = df.drop('Unnamed: 0', axis='columns')

### NULL Values

In [5]:
# Number of missing values in each column.
df.isna().sum()

Year          0
Film          1
Cast         82
Director     96
Genre       452
dtype: int64

In [6]:
# Discard rows whose Year cell is the literal text 'Notes'
# (presumably a stray table header from the scraped source — confirm).
year_is_notes = df['Year'].eq('Notes')
df = df.loc[~year_is_notes]

# Discard rows whose Genre cell holds one of these two values
# (they look like person names, so the row's columns are presumably shifted — confirm).
genre_is_misplaced = df['Genre'].isin(['Rishi Raj Acharya', 'Rim Bishwokarma'])
df = df.loc[~genre_is_misplaced]

### Cleaning year column

In [7]:
# List the distinct raw Year strings to see which formats need parsing.
df.loc[:, 'Year'].unique()

array(['1964', '1966', '1967', '1971', '1973', '1977', '1978', '1980',
       '1981', '1982', '1983', '1984', '1985', '1986', '1987', '1988',
       '1989', '1990', '1991', '1992', '1993', '1994', '1995', '1996',
       '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004',
       '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012',
       'April 2013', '2013', 'March 2014', '2014', 'November 14',
       'December 12', '2015', 'September 2016', '2016 October 24', '2018',
       '31 August 2018', '14 Dec 2018', '7 Dec 2018', '8 June 2018',
       'May 24, 2019', 'Sep 13 2019', 'December 6, 2019', 'Feb 7 2020',
       'Feb 21 2020', 'Feb 24 2020', 'March 14 2020', '2020', '2023',
       '2016', '2017', '2019'], dtype=object)

In [8]:
from dateutil import parser
def extract_year(date_str):
    """Return the four-digit year written in ``date_str``.

    The Year column mixes plain years ('1964'), month-year strings
    ('April 2013') and full dates in several orders ('2016 October 24',
    'May 24, 2019', '14 Dec 2018'). All of these carry an explicit
    four-digit year, which is what this function extracts.

    Strings with no four-digit year (e.g. 'November 14', 'December 12') give
    ``None``. ``dateutil.parser.parse`` would instead fill the missing year
    from today's date, silently stamping those rows with the year in which
    the notebook happens to be run.

    Parameters
    ----------
    date_str : str
        Raw value from the Year column. Other types are converted with
        ``str()`` before searching.

    Returns
    -------
    int or None
        The first standalone four-digit number in the text, or ``None`` when
        there is none (this becomes NaN when used through ``Series.apply``).
    """
    # Exactly four digits, not embedded in a longer run of digits.
    year_match = re.search(r'(?<!\d)(\d{4})(?!\d)', str(date_str))
    if year_match is None:
        return None
    return int(year_match.group(1))

In [9]:
# A few rows hold only a release day with no year ('November 14', 'December 12').
# Parsing those directly cannot give the right year, so blank them out and carry
# the previous row's value forward before extracting the year.
# NOTE(review): this assumes rows are in chronological order, so the previous row
# is from the same year — the order of the unique values above suggests so; confirm.
year_text = df['Year'].astype(str)
has_explicit_year = year_text.str.contains(r'(?<!\d)\d{4}(?!\d)')
df['Year'] = (
    year_text
    .where(has_explicit_year)   # year-less entries become NaN
    .ffill()                    # borrow the preceding row's date text
    .apply(extract_year)
    .astype(int)                # fails loudly if any year is still missing
)

### Cleaning movie names

In [10]:
# List the distinct raw titles to see what needs normalising.
df.loc[:, 'Film'].unique()

array(['Aama (mother)', "Maitighar (Birthplace[women's])",
       'Hijo, Aaja, Bholi (Yesterday, Today, Tomarrow)',
       'Parivartan (Change)', "Mann Ko Bandh (Heart's stoppage)",
       'Kumari (Living goddess)', "Paral Ko Aago (Straw's Flame)",
       'Sindoor (Marriage symbol)', 'Jeevan Rekha (Life Line)',
       'Bansuri (Flute)', 'Juni (Incarnation)',
       'Bachana Chahane Haru (Those Who Want to Live)',
       'Badalindo Aakash (The Changing Sky)', 'Samjhana (Memories)',
       'Kanchhi (Little Sister)', 'Adarsha Nari (Exemplary Woman)',
       "Ke Ghar Ke Dera (What's Home, What's Tenancy)",
       'Kusume Rumal (Scarlet Handkerchief)', 'Basudev [\nlord Vishnu]',
       'Biswas (Faith)', 'Saino (Relation)', 'Anyay (Injustice)',
       'Jhodaa (Pair)', 'Maya Preeti (Love and Affection)',
       'Sahas (Courage)', 'Behuli (The Bride)',
       'Bhagya Rekha (Line of Fate)', 'Santaan (Offspring)',
       'Lahure (The Armyman)', 'Mayalu (Beloved)', 'Pariwar (Family)',
       'Che

In [11]:
def clean_movie_name(movie_name):
    """Normalise a film title so that variants of the same title compare equal.

    Steps, in order:

    1. Remove every ``[...]`` or ``(...)`` segment (translations, footnote
       markers such as ``[13]``).
    2. Drop every character that is neither alphanumeric nor whitespace.
    3. Lowercase.
    4. Collapse each run of whitespace (including newlines) to one space and
       trim both ends. Without this, removing a bracketed segment from the
       middle of a title leaves a double space, and such titles would not
       match an otherwise identical one when counting duplicates.

    Parameters
    ----------
    movie_name : str
        Raw title. Non-string values (e.g. NaN for a missing title) are
        returned unchanged instead of raising ``TypeError``.

    Returns
    -------
    str
        The normalised title, or the input itself when it is not a string.
    """
    if not isinstance(movie_name, str):
        return movie_name

    # Remove text inside square brackets and parentheses.
    without_notes = re.sub(r'\[[^]]*\]|\([^)]*\)', '', movie_name)

    # Keep letters, digits and whitespace only.
    kept_chars = ''.join(ch for ch in without_notes if ch.isalnum() or ch.isspace())

    # split() with no argument splits on any whitespace run and ignores
    # leading/trailing whitespace, so join() yields single-spaced, trimmed text.
    return ' '.join(kept_chars.lower().split())

In [12]:
# Replace every title with its normalised form.
df['Film'] = df['Film'].map(clean_movie_name)

In [13]:
# How many rows share each normalised title.
movie_counts = df.loc[:, 'Film'].value_counts()

In [15]:
# Titles that occur on more than one row.
is_repeated = movie_counts.gt(1)
movies_more_than_once = movie_counts.loc[is_repeated].index.values

In [19]:
# Inspect every row whose title is repeated.
df.loc[df['Film'].isin(movies_more_than_once)]

Unnamed: 0,Year,Film,Cast,Director,Genre
35,1991,chino,"Bhuwan K.C., Shiv Shrestha, Kristi Mainali, Sh...",Tulsi Ghimire,
49,1992,bhauju,"Rajesh Hamal, Karishma, Neer Shah, Karishma Ma...",Rajendra Shalav,
59,1994,bhauju,"Rajesh Hamal, Karishma Manandhar",Rajendra Shalabh,
93,1995,jwala,Rajesh Hamal,Mukunda Bastakoti,
110,1996,bandhan,"Rajesh Hamal, Karishma Manandhar, Melina Manan...",Resh Raj Acharya,
112,1996,chhori buhari,"Rajesh Hamal, Pooja Chand",Laxminath Sharma,
146,1997,bandhan,"Dinesh Sharma, Rajesh Hamal, Karishma Manandha...",Resh Raj Acharya,
150,1998,malati,"Dinesh Sharma, Shiv Shrestha",Anil Sangraula,
183,1999,chhori buhari,"Rajesh Hamal,Puja Chand, Nir Shah,Sunil Thapa",Laxmikant Sharma,
219,2001,daag,"Dinesh Sharma, Dhiren Shakya, Jal Shah, Rajani...",,


In [22]:
# These two repeated titles are taken off the removal list, so their rows
# survive the filter in the next cell.
titles_to_keep = ['gaatho', 'mission paisa']
movies_more_than_once = np.setdiff1d(movies_more_than_once, titles_to_keep)

In [23]:
# Remove every row whose title is still on the repeated list.
# NOTE(review): this drops all copies of a repeated title, not just the extras — confirm this is intended.
has_repeated_title = df['Film'].isin(movies_more_than_once)
df = df.loc[~has_repeated_title]

In [26]:
# Drop two rows by index label
# (presumably the redundant copies of the two titles kept above — confirm).
# NOTE: re-running this cell raises KeyError because the labels are already gone.
df = df.drop(index=[432, 320])

In [27]:
# (rows, columns) remaining after the duplicate-title clean-up.
df.shape

(494, 5)

In [28]:
# Save the cleaned table. The index is written as well (the to_csv default, made
# explicit here); it appears as an 'Unnamed: 0' column when the file is read back
# without index_col.
df.to_csv(path_or_buf='data/cleaned_data.csv', index=True)

### Managing null values in the Cast and Director columns

In [31]:
# Rows with no Cast recorded.
df.loc[df['Cast'].isna()]

Unnamed: 0,Year,Film,Cast,Director,Genre
38,1991,The Friend,,Sanjay Pradhan,
74,1994,Pahilo Prem (First Love),,Chetan Karki,
79,1994,Rakchaya (Protection),,Prem Baniya,
89,1995,Dharma (Religion),,Amar Rasilee,
107,1996,Anartha (Wrong Meaning),,Kundan Khanal,
...,...,...,...,...,...
430,2015,Romance,,L N Gautam,
458,2016,Prem Geet (2016),,,"Love story, Comedy"
462,2016,Baato muniko phool 2,,,Social satire
469,2016,"Drama, Comedy",,"Sanjog Rana, Purnima Lama, Ryan Lama, Dhan Bor...",Rim Bishwokarma


In [38]:
# Rows with no Director recorded.
df.loc[df['Director'].isna()]

Unnamed: 0,Year,Film,Cast,Director,Genre
8,1980,Jeevan Rekha (Life Line),"Meenaxi Anand, Shiv Shrestha (debut)",,
87,1994,Tuhuro (Orphan),"Bhuwan K.C., Srijana Basnet, Mausami Malla, Di...",,
92,1995,Jeevan Sangharsha (Struggles of Life),"Rajesh Hamal, Mithila Sharma",,
104,1996,"""Aafno Birano"" (Native Foreigner)","Rajesh Hamal, Shreekrishna Shrestha, Shrisha K...",,
116,1996,Laxmi Puja (Goddess Laxmi worship),"Kristi Mainali, Ganesh Upreti, Shree Krishna S...",,
...,...,...,...,...,...
417,2015,Zhigrana[13],"Hanna G, Nikun Shrestha, Menuka Pradhan, Jyoti...",,
418,2014,Jwala,"Rejina Uprety, Biraj Bhatta, Mukesh",,
422,2015,Hostel Returns,"Sushil Shrestha, Sashi Shrestha, Nazir Hussain...",,
458,2016,Prem Geet (2016),,,"Love story, Comedy"


In [40]:
# Rows where both Cast and Director are missing.
df[df[['Cast', 'Director']].isna().all(axis=1)]

Unnamed: 0,Year,Film,Cast,Director,Genre
130,1997,Chahaari,,,
132,1997,Dauntari,,,
153,1998,Chamatkar,,,
163,1998,Pardesi,,,
190,1999,Himalaya (Caravan) official entry for the Oscars,,,
217,2001,Bihani,,,
242,2002,Anjuli,,,
245,2002,Bakshish,,,
255,2002,Mama Bhanja,,,
257,2002,Man Mandir,,,


### Genres

In [18]:
# Drop rows whose Genre cell holds one of these two values (they look like person
# names rather than genres). The same filter is already applied earlier in the
# notebook, so on a top-to-bottom run this changes nothing.
genre_is_misplaced = df['Genre'].isin(['Rishi Raj Acharya', 'Rim Bishwokarma'])
df = df.loc[~genre_is_misplaced]

In [19]:

# Frequency of each raw Genre string. Missing genres are left out (dropna=True is
# the default), so every index entry is a string that can be split below.
genre_counts = df['Genre'].value_counts(dropna=True)

In [20]:
# Split every comma-separated Genre string and gather the pieces in one flat list
# (labels are still unstripped and may repeat at this point).
genres = [
    label
    for genre_text in genre_counts.index
    for label in genre_text.split(',')
]
    

In [21]:
# Trim surrounding whitespace and drop duplicates. The result is sorted because a
# plain list(set(...)) of strings can come out in a different order on every kernel
# start (string hashing is randomised), which would reshuffle genres.txt on each run.
genres = sorted({genre.strip() for genre in genres})

In [22]:
# Number of distinct genre labels found.
len(genres)

27

In [23]:
# Write the genre labels to genres.txt, one per line.
with open('genres.txt', 'w') as genres_file:
    genres_file.writelines(label + '\n' for label in genres)
     