## Import Libraries

In [None]:

import importlib.util
import os
import pathlib
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from wordcloud import WordCloud

## Import Data Files

In [None]:
# Everything below is resolved relative to the parent of the current working directory.
project_root = os.path.dirname(os.getcwd())
source_file_path = os.path.join(project_root, 'parent', 'constants', '__init__.py')

# Load the constants module straight from its file path so the notebook can
# read DATASET_DIR / DATASET_PROCESSED_DIR from it.
spec = importlib.util.spec_from_file_location('__init__', source_file_path)
source_file = importlib.util.module_from_spec(spec)
spec.loader.exec_module(source_file)

# Walk the processed-dataset directory recursively and keep every '.csv' file.
processed_dir = os.path.join(
    project_root,
    source_file.DATASET_DIR,
    source_file.DATASET_PROCESSED_DIR,
)
path = []
for dirname, _, filenames in os.walk(processed_dir):
    path.extend(
        candidate
        for candidate in (os.path.join(dirname, name) for name in filenames)
        if pathlib.Path(candidate).suffix == '.csv'
    )


##### Read the imported files

In [None]:
# Load the training data from the discovered CSV named 'spam.csv'.
# If several files share that name, the last one found wins.
train_set = None
for filename in path:
    if os.path.basename(filename) == 'spam.csv':
        train_set = pd.read_csv(filename)

# Fail here with a clear message instead of a NameError in a later cell.
if train_set is None:
    raise FileNotFoundError(
        "No 'spam.csv' found among the discovered CSV files: " + repr(path)
    )
   

## Exploratory Data Analysis (EDA)

##### First, get a brief idea of the data, i.e. its features

In [None]:
train_set.head(n=2)  # preview the first two rows of the dataset

##### Now gather insights about the data, i.e. null values

In [None]:
# info() reports each column's dtype and non-null count, which is used here
# to spot features with missing values.
train_set.info()
print("\n","="*80,"\n")

# No cell in this notebook loads `test_set`, so referencing it directly raised
# NameError. It is inspected only when it has been defined.
_test_frame = globals().get("test_set")
if _test_frame is not None:
    _test_frame.info()
    print("\n","="*80,"\n")

##### Now find the actual % of null values for every feature

In [None]:
# Percentage of null values for every feature.
print(100*train_set.isnull().sum()/len(train_set))

# No cell in this notebook loads `test_set`, so referencing it directly raised
# NameError. It is reported only when it has been defined.
_test_frame = globals().get("test_set")
if _test_frame is not None:
    print("\n","="*80,"\n")
    print(100*_test_frame.isnull().sum()/len(_test_frame))

##### Get a better insight into the data, i.e. mean, standard deviation, percentiles, etc.

In [None]:
# Summary statistics: count, mean, std, min, max and quartiles.
print(train_set.describe())

# No cell in this notebook loads `test_set`, so referencing it directly raised
# NameError. It is described only when it has been defined.
_test_frame = globals().get("test_set")
if _test_frame is not None:
    print("\n","="*80,"\n")
    print(_test_frame.describe())

##### Make a copy of actual data

In [None]:
# Work on copies so the frames as loaded stay untouched.
train_set_mod = train_set.copy()

# No cell in this notebook loads `test_set`, so copying it unconditionally
# raised NameError. `test_set_mod` is None when no test set is available.
_test_frame = globals().get("test_set")
test_set_mod = None if _test_frame is None else _test_frame.copy()

## Data Visualisation

In [None]:
# Number of rows per genre, taken from the 'GENRE' column of train_set_mod.
genre_counts = train_set_mod["GENRE"].value_counts()

# Horizontal bar chart of the per-genre counts.
genre_counts.plot(figsize=(10, 8), kind='barh')
plt.title("Count of Genre")
plt.xlabel("Count", size=12)
plt.ylabel("Genre", size=12)
plt.grid(axis="x", linestyle="-.")
plt.show()

In [None]:
# value_counts() orders genres by descending frequency, whereas unique()
# returns them in order of first appearance. Labels must come from the
# value_counts() index, otherwise they are attached to the wrong wedges.
values = train_set_mod['GENRE'].value_counts()
output_categories = values.index.to_numpy()

# Wedge offsets that pull the small trailing slices apart so their labels do
# not overlap. The pattern was written for 27 genres; it is padded with zeros
# or truncated so that pie() does not raise when the genre count differs.
explode_pattern = (0,) * 13 + (0.8, 0.75, 0.65, 0.55, 0.45, 0.35, 0.2, 0, 0.2, 0.4, 0.6, 0.8, 1.0, 1.2)
explode = (explode_pattern + (0,) * len(values))[:len(values)]

plt.figure(figsize=(8,8))
plt.pie(values, labels=output_categories, autopct='%1.1f', startangle=90, radius=1.2, explode=explode)
plt.show()



In [None]:

# Concatenate each column into one space-separated string for the word clouds.
genre_text = ' '.join(train_set_mod['GENRE'])
movie_name_text = ' '.join(train_set_mod['TITLE'])

# Both clouds share the same canvas settings.
cloud_options = dict(width=800, height=400, background_color='white')
genre_wordcloud = WordCloud(**cloud_options).generate(genre_text)
movie_name_wordcloud = WordCloud(**cloud_options).generate(movie_name_text)

# Display the word clouds side by side.
panels = (
    (genre_wordcloud, 'Genre Word Cloud'),
    (movie_name_wordcloud, 'Movie Name Word Cloud'),
)
plt.figure(figsize=(12, 6))
for position, (cloud, heading) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.imshow(cloud, interpolation='bilinear')
    plt.title(heading)
    plt.axis('off')

plt.show()

In [None]:
# Build the corpus as a 2-D list: one row per genre label, each row holding
# every whitespace-separated word from that genre's processed descriptions.
genre_total = len(train_set_mod['GENRE'].unique())
corpus = [
    [
        token
        for text in train_set_mod.loc[
            train_set_mod['Labeled Genre'] == label, 'Processed Description'
        ].tolist()
        for token in text.split()
    ]
    for label in range(genre_total)
]



In [None]:
# One bar chart per genre showing its 20 most frequent words.

# Three charts per row; the row count follows the corpus size instead of a
# fixed 9x3 grid (27 genres still gives 9 rows). Ceiling division, at least 1.
n_cols = 3
n_rows = max(1, -(-len(corpus) // n_cols))
fig, axs = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(16,30), squeeze=False)
plt.subplots_adjust(hspace=0.75)
fig.suptitle('Bar Graphs showing the Most Common words')

# The original titles used a label encoder `le` that is never defined in this
# notebook. The label -> genre-name lookup is rebuilt from the data instead.
# NOTE(review): assumes 'Labeled Genre' is the integer encoding of 'GENRE' - confirm.
genre_names = (
    train_set_mod.drop_duplicates('Labeled Genre')
    .set_index('Labeled Genre')['GENRE']
)

for i, words in enumerate(corpus):
    ax = axs[i // n_cols, i % n_cols]
    ax.set_title(genre_names.get(i, str(i)))

    df = pd.DataFrame(Counter(words).most_common(20))
    if df.empty:
        # A genre without any words has nothing to plot; df[0] would raise KeyError.
        continue

    # One random RGB colour per bar.
    colors = np.random.rand(len(df), 3)
    ax.bar(df[0], df[1], color=colors)

    # Setting tick positions and labels
    ax.set_xticks(np.arange(len(df[0])))
    ax.set_xticklabels(df[0].tolist(), rotation=90)

plt.show()