In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import nltk
import os
import re
import tensorflow as tf
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.regularizers import l2



# Data Preprocessing

### Data Cleaning

In [None]:
# Load the raw dataset; the path is relative to the notebook's working directory.
data = pd.read_csv('FinalData.csv')

In [None]:
# Shuffle all rows and renumber the index from 0. A fixed random_state makes the
# row order (and every later step that depends on it) reproducible across
# kernel restarts.
SHUFFLE_SEED = 42
data = data.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)


In [None]:

# Data-quality summaries. They are computed before any column is removed because
# the duplicate check relies on 'computed_key', which is dropped below.
missing_values = data.isna().sum()
duplicates = data.duplicated(subset='computed_key').sum()

# Number of rows per genre label
genre_distribution = data['genre'].value_counts()

# Remove the identifier and audio-feature columns listed here; later cells only
# read the 'lyrics' and 'genre' columns.
data = data.drop(
    columns=[
        'artist_name', 'track_name', 'computed_key', 'year',
        'danceability', 'energy', 'key', 'loudness', 'mode',
        "speechiness", 'acousticness', 'instrumentalness', 'liveness',
        'valence', 'tempo', 'duration_ms', 'time_signature',
    ]
)


-------------

# Data Preprocessing for model training

### Working with lyrics

In [None]:
# drop_duplicates() and dropna() return new frames instead of modifying `data`,
# so the result has to be assigned back for the clean-up to take effect.
# how='all' removes only rows in which every column is missing.
data = data.drop_duplicates().dropna(how='all')
data.head()

In [None]:
def is_numeric(value):
    """Return True if ``value`` can be converted with ``float()``, else False.

    Values that ``float()`` rejects with a ``ValueError`` (e.g. ``"rock"``) or
    with a ``TypeError`` (e.g. ``None`` or a list) are reported as non-numeric
    instead of raising.

    Note that NaN, and strings such as ``"nan"`` or ``"inf"``, convert
    successfully and are therefore reported as numeric.
    """
    try:
        float(value)
    except (TypeError, ValueError):
        return False
    return True

# Columns that must hold text labels rather than numbers
columns_to_check = ['genre']

# Collect the index labels of every row where a checked column holds a value
# that is_numeric() accepts (this includes NaN, since float(nan) succeeds).
rows_to_drop = set()
for column in columns_to_check:
    # astype(bool) keeps the mask boolean even when the frame is empty
    numeric_mask = data[column].map(is_numeric).astype(bool)
    rows_to_drop.update(data.index[numeric_mask])

# Drop rows with conflicting data types
cleaned_data = data.drop(index=list(rows_to_drop))

# The following cells keep working on `data`, so hand the filtered frame back to
# it; otherwise the rows identified above would stay in the pipeline.
data = cleaned_data

cleaned_data.head()

### Removing numbers and punctuation, and lowercasing words

In [None]:
def rid_of_specials(words):
    """Replace everything that is not an ASCII letter with a space and lowercase.

    Each run of consecutive non-letter characters (digits, punctuation,
    whitespace, non-ASCII characters) is collapsed into a single space.

    Parameters
    ----------
    words : str
        The raw text to clean.

    Returns
    -------
    str
        The cleaned, lower-cased text. Non-string input (for example the NaN
        pandas uses for a missing value) yields an empty string.
    """
    if not isinstance(words, str):
        return ''
    # Run the pattern over the whole string: applied one character at a time,
    # the '+' quantifier could never merge neighbouring non-letters.
    return re.sub('[^A-Za-z]+', ' ', words).lower()
# Overwrite the 'lyrics' column with its cleaned version, one lyric at a time.
data["lyrics"] = data["lyrics"].map(rid_of_specials)

# Remove Stopwords

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Load NLTK's English stop-word list. On a machine where the corpus has not been
# downloaded yet the lookup raises LookupError, so fetch it once and retry.
try:
    sw_nltk = stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    sw_nltk = stopwords.words('english')

# A set makes the per-token membership test in remove_sw cheap
stop_words = set(sw_nltk)

#### Remove stopwords

In [None]:
def remove_sw(x, stopword_set=None):
    """Return ``x`` with all stop words removed.

    The text is split on whitespace, so runs of spaces do not produce empty
    tokens and the result contains single spaces between the remaining words.

    Parameters
    ----------
    x : str
        Text to filter; tokens are compared to the stop words as-is.
    stopword_set : collection of str, optional
        Words to remove. Defaults to the module-level ``stop_words`` set.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    if stopword_set is None:
        stopword_set = stop_words
    return ' '.join(token for token in x.split() if token not in stopword_set)
# Stop-word-free copy of every lyric; the 'lyrics' column itself is left as is.
stopped = data["lyrics"].map(remove_sw)

#### Lemmatize words
Lemmatization is used here instead of stemming. A lemmatizer maps each word to a valid dictionary form (its lemma) and can take the word's part of speech and context into account, so one word can have different lemmas depending on how it is used. Stemming only strips suffixes by rule and does not consider this, which makes it disadvantageous for certain words.




In [None]:
from nltk.stem import WordNetLemmatizer
# Step 1: build the lemmatizer once and reuse it for every token
lemmatizer = WordNetLemmatizer()

# Step 2: lemmatize word by word. lemmatize() expects a single word; given a
# whole lyric it treats the text as one word and, finding no match, hands it
# back effectively unchanged. Each lyric is therefore split into tokens first.
lemmatized = [
    [lemmatizer.lemmatize(token) for token in lyric.split()]
    for lyric in stopped
]

# Step 3: re-join the tokens of each lyric into one space-separated string
prepeared_sentence = [' '.join(tokens) for tokens in lemmatized]
data['Lyrics_Processed'] = prepeared_sentence
data['Lyrics_Processed']

In [None]:
import pandas as pd
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

# Fix langdetect's seed so that language detection is deterministic, i.e. the
# same text yields the same detected language on every run.
DetectorFactory.seed = 0

def is_english(text):
    """Return True when langdetect classifies ``text`` as English ('en').

    A LangDetectException raised by ``detect`` is treated as "not English"
    and yields False.
    """
    try:
        language = detect(text)
    except LangDetectException:
        return False
    return language == 'en'

# Keep only the rows whose (cleaned) lyrics are detected as English.
data = data[data['lyrics'].apply(is_english)]

# Drop the row labelled 0, as the original cell did. errors='ignore' prevents a
# KeyError when that row has already been removed by the filter above.
# NOTE(review): the frame was shuffled and re-indexed earlier, so label 0 looks
# like an arbitrary row - confirm that this drop is still wanted.
data = data.drop(index=0, errors='ignore')

In [None]:
# The raw-text column is no longer needed once 'Lyrics_Processed' exists.
data = data.drop('lyrics', axis='columns')

In [None]:
# Write the processed frame to disk; index=False leaves the row index out of the
# file. The file name is kept exactly as spelled, since anything that reads this
# file would refer to it by this name.
data.to_csv('Filterd.csv', index=False)