In [None]:
# -*- coding: utf-8 -*-
"""ml_pipeline_Avit_Brian_Mugisha_music_data_preprocessing.ipynb

# Data Processing Approach for Portfolio Project

## Project Title: Music Mastery (MM)

## Student Name: Avit Brian MUGISHA

---

1. **Data Sources and Aggregation:**
   - List all sources of data for the project. **You must consider sources outside Kaggle, Google datasets** (insert links where necessary to online platforms, research papers, etc.)

   - **Data Sources:** The dataset used for this project is sourced from the Music Data Collection [Insert link to dataset if available].

   - Determine if data aggregation from multiple sources is necessary for comprehensive analysis.

   - **Data Aggregation:** The dataset provides comprehensive details about artists, albums, and tracks. For this project, I will only use this single source as it covers a wide range of relevant features.
"""


In [None]:
import pandas as pd

# Relative path to the artist-records CSV that feeds the rest of the notebook
url = 'summatives/talentai/data/talentai_artist_records.csv'

# Read the file into the working DataFrame
df = pd.read_csv(filepath_or_buffer=url)

# Preview the first ten rows (a quick look at the data, not summary statistics)
df.head(n=10)


In [None]:
"""
2. **Data Format Transformation:**
   - Describe the current format of the data.
   - Outline the planned transformation to a unified format suitable for analysis and modeling.

    - **Current Format:** The dataset is structured in a tabular format with features such as artist name, genre, popularity, and number of tracks.
    - **Transformation Plan:** The data will be transformed into a format suitable for analysis by standardizing numerical features and encoding categorical variables.
"""

from sklearn.preprocessing import StandardScaler

# Name of the label column. 'target_column' is a placeholder: replace it with
# the dataset's actual target column before running this cell.
target_col = 'target_column'

# Separate features and target. `X` keeps the raw (unscaled) features so the
# original values stay available after preprocessing.
X = df.drop(columns=[target_col])
y = df[target_col]

# Identify numerical columns
numerical_cols = X.select_dtypes(include=['number']).columns.tolist()

# Standardize numerical columns on a copy instead of mutating `X` in place.
# `X_processed` is a separate object, so later in-place edits to it cannot
# silently change `X`.
# NOTE(review): the scaler is fit on the full dataset, before the train/test
# split performed later in the notebook, so test-set statistics leak into the
# scaling. Consider fitting it on the training split only.
scaler = StandardScaler()
X_processed = X.copy()
if numerical_cols:  # StandardScaler rejects a frame with zero columns
    X_processed[numerical_cols] = scaler.fit_transform(X[numerical_cols])
X_processed.head()


In [None]:
"""
3. **Data Exploration:**
   - Enumerate the features included in the dataset.
   
   - Summarize findings from exploratory data analysis (EDA) including distributions, correlations, and outliers.
    - **Features:** Features included in the dataset are: artist_name, genre, popularity, number_of_tracks, etc.
  
    - **Findings:** Conduct exploratory data analysis (EDA) to understand distributions, correlations, and outliers in the dataset. Features such as popularity and genre will be analyzed to gain insights into their relationship with the target variable (if applicable). 
"""

import matplotlib.pyplot as plt
import seaborn as sns

# Selecting features for EDA
selected_features = ['popularity', 'number_of_tracks']

# One panel per feature on a single row. Explicit Axes objects avoid the
# pyplot state machine, and `squeeze=False` keeps `axes` two-dimensional even
# when only one feature is selected.
fig, axes = plt.subplots(1, len(selected_features), figsize=(12, 4), squeeze=False)

# `palette` only takes effect together with `hue`; without it seaborn ignores
# the argument and warns. Draw one viridis colour per panel and pass it via
# `color` instead.
panel_colors = sns.color_palette('viridis', n_colors=len(selected_features))

# Plotting distributions for selected features
for ax, feature, panel_color in zip(axes.ravel(), selected_features, panel_colors):
    sns.histplot(data=df, x=feature, kde=True, bins=20, alpha=0.7, color=panel_color, ax=ax)
    ax.set_title(f'Distribution of {feature}')
    ax.set_xlabel(feature)
    ax.set_ylabel('Count')

fig.tight_layout()
plt.show()


"""
4. **Hypothesis Testing:**
   - State any preexisting hypotheses about the data.
   - Explain methodologies to empirically test these hypotheses.

    - **Hypothesis 1:** Popularity is positively correlated with the number of tracks an artist has.
    - **Hypothesis 2:** Genre distribution impacts the overall popularity of the artist.
"""


In [None]:
"""
5. **Handling Sparse/Dense Data and Outliers:**
   - Assess the density of the data.
   - Propose strategies to handle missing data and outliers while maintaining dataset integrity.

    - **Data Density:** Assess the density of missing data.
    - **Strategies:** Impute missing numeric values with the column mean and cap extreme values at the 99th percentile (an IQR-based rule is an alternative for outlier detection).
"""

# Calculate the capping threshold on the SAME scale as the column being
# clipped. `X_processed['popularity']` was standardized earlier (popularity is
# one of the numerical columns), so a 99th percentile taken from the raw
# `df['popularity']` would be on a different scale and the clip would have no
# meaningful effect.
popularity_cap = X_processed['popularity'].quantile(0.99)

# Cap extreme values for selected features. `assign` returns a new frame, so
# re-running this cell always starts from the same input.
X_processed = X_processed.assign(
    popularity=X_processed['popularity'].clip(upper=popularity_cap)
)

# Impute missing values with the column mean. `numeric_only=True` is required
# because the mean is undefined for text columns and pandas 2.x raises on
# them. Missing values in non-numeric columns are left untouched here.
X_processed = X_processed.fillna(X_processed.mean(numeric_only=True))

X_processed.head()


In [None]:
"""
6. **Data Splitting:**
   - Define a methodology to split the dataset into training, validation, and testing sets.
   - Ensure randomness and representativeness in each subset.
    - **Methodology:** Split the dataset into training, validation, and testing sets.
    - **Considerations:** Ensure randomness and stratification based on the target variable for representativeness (if applicable).
"""

from sklearn.model_selection import train_test_split

# Features come from the preprocessed frame; labels are read from the raw data
X = X_processed
y = df['target_column']  # Replace 'target_column' with your actual target column if applicable

# Stage 1: hold out 20% of the rows as the test set, stratified on the labels
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Stage 2: take 25% of the remaining 80% as validation (60/20/20 overall)
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.25, random_state=42, stratify=y_trainval
)

# Report the size of every subset as a sanity check
print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"X_val shape: {X_val.shape}, y_val shape: {y_val.shape}")
print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape} \n")


In [None]:
"""
7. **Bias Mitigation:**
   - Implement techniques to identify and mitigate biases in the dataset.
   - Ensure fairness and equity in data representation.
    - **Techniques:** SMOTE was used to address class imbalance in the target variable (if applicable).
"""

from imblearn.over_sampling import SMOTE, SMOTENC

# Plain SMOTE synthesizes samples by interpolating between neighbours, which
# only works for numeric features. Flag every non-numeric column so a
# mixed-type training frame is resampled with SMOTENC instead of failing.
numeric_feature_names = X_train.select_dtypes(include=['number']).columns
categorical_mask = ~X_train.columns.isin(numeric_feature_names)

# Apply SMOTE (SMOTENC when categorical features are present) to address
# class imbalance. Only the training split is resampled; validation and test
# data keep their original class distribution.
if categorical_mask.any():
    smote = SMOTENC(categorical_features=categorical_mask, random_state=42)
else:
    smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
print("Resampled class distribution:", y_train_resampled.value_counts())


In [None]:
"""
8. **Features for Model Training:**
   - Identify relevant features for training the model.
   - Rank features based on their significance to project objectives.

 **Your answer for features must be plotted, with the working shown in code.**
"""

from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

# The random forest needs numeric input. Encode every non-numeric feature as
# integer category codes (missing values become -1) on a copy that is used
# only for ranking, so `X_train` itself is left unchanged.
# NOTE(review): impurity-based importances tend to favour high-cardinality
# features, so an identifier-like column such as artist_name may rank
# artificially high. Consider dropping it or using permutation importance.
X_train_rank = X_train.copy()
for col in X_train_rank.select_dtypes(exclude=['number']).columns:
    X_train_rank[col] = X_train_rank[col].astype('category').cat.codes

# Initialize Random Forest Classifier
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_rank, y_train)

# Get feature importances, indexed by the columns the model was actually
# fitted on rather than by the notebook-level `X`.
feat_importances = pd.Series(rf.feature_importances_, index=X_train_rank.columns)

# Sort ascending so the most important feature is drawn at the top of the
# horizontal bar chart.
ax = feat_importances.nlargest(10).sort_values().plot(kind='barh', figsize=(8, 5))
ax.set_title('Top 10 Important Features')
ax.set_xlabel('Feature Importance Score')
ax.set_ylabel('Features')
plt.show()


In [None]:
"""
9. **Types of Data Handling:**
   - Classify the types of data (categorical, numerical, etc.) present in the dataset.
   - Plan preprocessing steps for each data type.

    - **Data Types:** The dataset includes numerical and categorical data.

      - Numerical Columns:

          popularity,
          number_of_tracks,
      - Categorical Columns:

          genre,
          artist_name,
    - **Preprocessing:** Plan preprocessing steps such as encoding categorical variables and scaling numerical features.
"""

# Example preprocessing pipeline for numerical and categorical features
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Every non-numeric feature column is treated as categorical. This name was
# previously used without ever being defined, which raised a NameError.
categorical_cols = X.select_dtypes(exclude=['number']).columns.tolist()

# Define the preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        # Standardize numeric features
        ('num', StandardScaler(), numerical_cols),
        # One-hot encode categorical features. `handle_unknown='ignore'` lets
        # the fitted encoder transform rows containing categories it did not
        # see during fitting instead of raising.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ])

# Apply preprocessing. From here on `X_processed` holds the transformer output
# (a NumPy array or SciPy sparse matrix), not a DataFrame.
# NOTE(review): one-hot encoding a high-cardinality column such as artist_name
# can create a very wide matrix. Confirm that column belongs in the model.
X_processed = preprocessor.fit_transform(X)


"""
10. **Data Transformation for Modeling:**
    - Specify methods for transforming raw data into a model-friendly format.
    - Detail steps for normalization, scaling, or encoding categorical variables.

      - **Methods:** Normalize numerical features and encode categorical variables for model training.
"""

"""
11. **Data Storage:**
    - Determine where and how processed data will be stored.
    - Choose suitable storage solutions ensuring accessibility and security.
      - **Storage Solution:** Store processed data in a structured format such as CSV files or a database for accessibility and security.
"""
