In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,r2_score

### 1. Loading and Exploring the Dataset

In [None]:
# Load the raw track dataset from the project-relative raw_data folder.
# A forward slash is used as the separator: the previous backslash form
# contained an invalid string escape (backslash + "m"), which triggers a
# SyntaxWarning on recent Python versions and only resolves on Windows.
df=pd.read_csv("raw_data/musicdata.csv")
df.head()

In [None]:
# Print a summary of the frame: column names, non-null counts and dtypes.
df.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max);
# with default arguments only the numeric columns of a mixed frame are summarized.
df.describe()

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Drop the irrelevant "Unnamed: 0" column
# (presumably a row index saved into the CSV -- confirm against the file).
# errors="ignore" keeps the cell re-runnable: once the column is gone, running
# the cell a second time would otherwise raise a KeyError.
df=df.drop(columns=["Unnamed: 0"],errors="ignore")
df.head()

In [None]:
# Names of the columns stored as 64-bit integers or 64-bit floats.
numeric_frame = df.select_dtypes(include=["int64", "float64"])
num_cols = numeric_frame.columns
num_cols

In [None]:
# Names of the object-dtype (text) columns.
object_frame = df.select_dtypes(include=["object"])
cat_cols = object_frame.columns
cat_cols

### 2. Handle Missing Values

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Replace missing text metadata with an explicit "Unknown" placeholder.
# The placeholder was previously misspelled, which would have written the
# typo into the data itself. The column list is named once instead of being
# repeated on both sides of the assignment.
text_cols=["Track Name","Artists","Album Name"]
df[text_cols]=df[text_cols].fillna("Unknown")

In [None]:
# Show only the columns that still contain missing values.
null_counts = df.isnull().sum()
null_counts[null_counts > 0]

### 3. Exploratory Data Analysis (EDA)

In [None]:
# Pairwise correlations between the numeric and boolean columns only.
# The frame still holds text columns (track, artist and album names); on
# pandas >= 2.0 DataFrame.corr() no longer drops those silently and raises
# instead, so the non-numeric columns are filtered out explicitly.
correlation_matrix = df.select_dtypes(include=["number", "bool"]).corr()

# Heatmap of the correlation matrix.
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, cmap='coolwarm', linewidths=0.5)
plt.title("Correlation Matrix")
plt.show()


### Scatter Plot

In [None]:
# One scatter panel per numeric audio feature, each plotted against Popularity.
num_col=['Danceability', 'Energy', 'Loudness', 'Valence','Duration (ms)','Key', 'Mode', 'Speechiness', 'Acousticness', 
         'Instrumentalness','Liveness','Tempo']

# 4 x 3 grid: one axes object per entry in num_col.
fig, axes = plt.subplots(4, 3, figsize=(16, 16))
for ax, num in zip(axes.flat, num_col):
    sns.scatterplot(x=df[num], y=df['Popularity'], ax=ax)
    ax.set_title(f'{num} vs Popularity')
    ax.set_xlabel(num)
    ax.set_ylabel('Popularity')

fig.tight_layout()
plt.show()


### Histogram and Box Plot

In [None]:
# Distribution of the Popularity column, drawn with 30 bins.
ax = sns.histplot(df["Popularity"], bins=30)
ax.set_title('Distribution of Popularity Scores')
ax.set_xlabel('Popularity')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Popularity spread grouped by the 'Explicit' flag
# (whether the song is marked explicit or not).
ax = sns.boxplot(x=df['Explicit'], y=df['Popularity'])
ax.set_title('Popularity vs Explicit Content')
ax.set_xlabel('Explicit Content (0 = False, 1 = True)')
ax.set_ylabel('Popularity')
plt.show()


### 4. Prepare the Data for Modeling

In [None]:
# Cast the Explicit column to an integer dtype so it can be used as a
# numeric model feature (a Boolean column becomes 0 / 1).

df["Explicit"] = df["Explicit"].astype(int)

In [None]:
# Predictor columns used by the model: audio descriptors plus the explicit flag.
features = [
    'Danceability',
    'Energy',
    'Loudness',
    'Valence',
    'Acousticness',
    'Instrumentalness',
    'Liveness',
    'Speechiness',
    'Tempo',
    'Duration (ms)',
    'Key',
    'Mode',
    'Explicit',
]

# Column the model is trained to predict.
target = 'Popularity'

In [None]:
# Standardize every feature column to zero mean and unit variance.
scaler = StandardScaler()

# Wrap the scaled array in a DataFrame that carries df's own index.
# The assignment back into df aligns on index labels, so a fresh 0..n-1 index
# would silently misplace values (or introduce NaN) whenever df's index is
# not the default range, e.g. after rows have been filtered out.
df_scaled = pd.DataFrame(
    scaler.fit_transform(df[features]),
    columns=features,
    index=df.index,
)
df[features] = df_scaled

# NOTE(review): the scaler is fitted on all rows, before the train/test split
# performed later in the notebook, so the scaling statistics include the
# held-out rows. For a strict evaluation, fit the scaler on the training
# split only and apply transform() to the test split.

In [None]:
# Preview the first rows after the feature columns were replaced by their scaled values.
df.head()

### 5. Split the Data for Training and Testing

In [None]:
# Feature matrix and target vector. The target column is taken from the
# `target` variable defined with the feature list, instead of repeating the
# column name as a second hard-coded literal.
X=df[features]
y=df[target]

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [None]:
# Preview the first rows of the training feature matrix.
x_train.head()

In [None]:
# Row counts of the training features and training target (should match).
print(x_train.shape[0], y_train.shape[0])

In [None]:
# Row counts of the test features and test target (should match).
print(x_test.shape[0], y_test.shape[0])

### 6. Build a Predictive Model

In [None]:
# Linear regression estimator with default settings.
model=LinearRegression()

In [None]:
# Fit the model on the training split only.
model.fit(x_train,y_train)

In [None]:
# Predicted target values for the held-out test rows.
y_pred=model.predict(x_test)

In [None]:
# Display the array of predictions.
y_pred

### 7. Evaluate the Model

In [None]:
# Test-set error metrics: mean squared error and coefficient of determination.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Report the test-set metrics computed from y_test and y_pred.
print("R2 Score : ",r2)
print("mean_squared_error : ",mse)

In [None]:
# Fitted regression coefficients: one per column of the training matrix,
# i.e. in the same order as `features`.
coefficients = model.coef_

In [None]:
# Pair each feature name with its fitted coefficient for easier reading.
coef_df = pd.DataFrame({'Feature': features, 'Coefficient': coefficients})

# Leave the frame as the cell's last expression so the notebook renders it as
# a formatted table instead of the plain-text dump produced by print().
coef_df

**The linear regression model performs poorly on the held-out test set (see the R² score and mean squared error above), so further improvements are needed.**

### 8. Generate Insights and Recommendations

**Key Insights on Track Popularity**
- Danceability, energy, and loudness contribute significantly to popularity.
- High instrumentalness tends to lower popularity.
- Speechiness, tempo, and valence also impact popularity.

**How Artists or Producers Can Optimize Tracks for Higher Popularity**
- Increase danceability and energy with rhythmic beats.
- Ensure appropriate loudness levels in mastering.
- Balance speechiness based on the track's style.
- Reduce instrumental-only sections for broader appeal.

**Model Improvement Suggestions**
- Use advanced models like Random Forest, XGBoost, or Neural Networks for better accuracy.
- Improve feature selection and data preprocessing.
- Optimize hyperparameters using GridSearchCV.