In [None]:
# Importing necessary libraries
import pandas as pd # Used for data manipulation and analysis, especially with data frames
import numpy as np # Provides support for arrays, matrices, and numerical operations
import matplotlib.pyplot as plt # Used for creating static, interactive, and animated plots
import seaborn as sns # Built on top of matplotlib, seaborn provides a high-level interface for drawing attractive statistical graphics
from sklearn.model_selection import train_test_split # Used for splitting arrays or matrices into train and test subsets.
from sklearn.neighbors import KNeighborsClassifier # Used for implementing the k-nearest neighbors value.
from sklearn.metrics import confusion_matrix # Used for evaluate the accuracy of a model.
from sklearn.metrics import classification_report # Used for building a text report.

In [None]:
# Load the dataset into a pandas DataFrame.
# The CSV location can be overridden through the YOUTUBE_STATS_CSV environment
# variable so the notebook is not tied to one machine's drive layout; when the
# variable is unset the original path is used unchanged.
import os

file_path = os.environ.get('YOUTUBE_STATS_CSV', r'E:\ML\global youtube statistics.csv')
# NOTE(review): the file is read as ISO-8859-1 — presumably it contains bytes
# that are not valid UTF-8; confirm before changing the encoding.
df = pd.read_csv(file_path, encoding='ISO-8859-1')

In [None]:
# 1. Initial Inspection
# First five rows, for a quick look at the data.
print("Dataset Overview:")
print(df.head(), "\n")

# Number of entries, column names, dtypes and non-null counts.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
print("Dataset Info:")
df.info()
print()

# dtype of every column, to spot columns that need a type conversion.
print("Data Types:")
print(df.dtypes, "\n")

In [None]:
# 2. Missing Values and Duplicates
# Count of null entries in each column.
missing_per_column = df.isnull().sum()
print("Missing Values:")
print(missing_per_column, "\n")

# Count of rows that repeat an earlier row in every column.
duplicate_row_count = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicate_row_count}\n")

In [None]:
# Fill missing values
# Numeric columns are filled with their mean; every other column is filled with
# its most frequent value.
# The filled Series is assigned back to the column rather than calling
# fillna(inplace=True) on df[col]: that form is a chained in-place update, which
# pandas warns about and which leaves df unchanged once Copy-on-Write is active.
for col in df.columns:
    # Testing for "numeric" instead of dtype == 'object' keeps text columns on
    # the mode branch even when pandas stores them with a dedicated string dtype.
    if pd.api.types.is_numeric_dtype(df[col]):
        df[col] = df[col].fillna(df[col].mean())
    else:
        modes = df[col].mode()
        # mode() is empty when the column has no non-null value at all; there
        # is nothing sensible to fill with, so the column is left as it is.
        if not modes.empty:
            df[col] = df[col].fillna(modes.iloc[0])
# Check for any remaining null values
print(df.isnull().sum())

In [None]:
# 3. Descriptive Statistics
print("Descriptive Statistics:")
# Names of the int64 / float64 columns; later cells reuse num_cols.
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
numeric_frame = df[num_cols]
# describe() output transposed so that each row summarises one column, extended
# with three extra statistics:
#   range  - max minus min, taken from the summary itself
#   median - middle value of each numeric column
#   mode   - most frequent value (first one when several tie)
stats_summary = df.describe().T.assign(
    range=lambda summary: summary['max'] - summary['min'],
    median=numeric_frame.median(),
    mode=numeric_frame.mode().iloc[0],
)
print(stats_summary, "\n")

In [None]:
# 4. Visualization: Histograms for Numerical Features
# Overview grid: one histogram per numeric column.
df[num_cols].hist(figsize=(10, 8), bins=20, edgecolor='black')
plt.suptitle("Histograms of Numerical Features")
plt.show()

# Add a log1p-transformed copy of the first numeric column, unless an earlier
# run of this cell already created it.
if 'Negative_Skew_Example' not in df.columns:
    df['Negative_Skew_Example'] = np.log1p(df[num_cols[0]])

# The derived column is guaranteed to exist at this point, so it is always
# plotted after the numeric columns.
columns_to_plot = [*num_cols, 'Negative_Skew_Example']

# One detailed figure per column: histogram + KDE with mean / median / mode
# marked, followed by a one-line skew summary.
for col in columns_to_plot:
    plt.figure(figsize=(8, 6))
    sns.histplot(df[col], kde=True, bins=20, color="blue", edgecolor="black", alpha=0.7)

    values = df[col]
    mean = values.mean()
    median = values.median()
    mode = values.mode()[0]  # first mode when several values tie

    # Vertical markers for the three measures of central tendency.
    plt.axvline(mean, color='red', linestyle='--', linewidth=2, label=f"Mean: {mean:.2f}")
    plt.axvline(median, color='green', linestyle='-', linewidth=2, label=f"Median: {median:.2f}")
    plt.axvline(mode, color='orange', linestyle='-.', linewidth=2, label=f"Mode: {mode:.2f}")

    plt.title(f"Histogram of {col} with Mean, Median, and Mode")
    plt.xlabel(col)
    plt.ylabel("Frequency")
    plt.legend()
    plt.show()

    # Skew direction is judged only from the mean/median ordering: mean above
    # median -> positive, below -> negative, otherwise none.
    skewness_type = (
        "Positive Skew" if mean > median
        else "Negative Skew" if mean < median
        else "No Skew"
    )
    print(f"{col}: Mean={mean:.2f}, Median={median:.2f}, Mode={mode:.2f} -> {skewness_type}")

In [None]:
# Distribution of the nominal 'category' column as a horizontal count plot.
if 'category' in df.columns:
    # Most frequent category first.
    category_order = df['category'].value_counts().index
    plt.figure(figsize=(12, 6))
    # seaborn deprecates passing `palette` without `hue`, so the plotted
    # variable is also given as hue (the same approach the box-plot cell uses).
    # hue_order mirrors `order` so colours still follow the frequency ranking,
    # and dodge=False keeps one full-width bar per category.
    ax = sns.countplot(
        data=df,
        y='category',
        hue='category',
        order=category_order,
        hue_order=category_order,
        palette='viridis',
        dodge=False,
    )
    # Some seaborn versions draw a legend for the redundant hue; it only
    # repeats the axis labels, so remove it when present.
    legend = ax.get_legend()
    if legend is not None:
        legend.remove()
    plt.title("Count of Categories")
    plt.show()

In [None]:
# Pair plot: scatter plots for every pair of numeric columns, with a KDE curve
# on the diagonal.
pair_data = df[num_cols]
sns.pairplot(pair_data, diag_kind="kde")
# y=1.02 lifts the title slightly above the top of the figure.
plt.suptitle("Pair Plot of Numerical Features", y=1.02)
plt.show()

In [None]:
# Heat map for Correlation
# Pairwise correlation of the numeric columns.
correlation_matrix = df[num_cols].corr()

plt.figure(figsize=(8, 6))  # 8x6 inches
# Display options for the heat map:
#   annot      - write the coefficient inside each cell
#   cmap       - 'coolwarm' colour map
#   fmt        - two decimal places for the annotations
#   linewidths - thin separator lines between cells
heatmap_options = dict(annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title("Correlation Matrix")
plt.show()

In [None]:
# Box plots of selected numeric columns, grouped by category.
numerical_cols = ['subscribers', 'video views', 'uploads']
grouping_col = 'category'  # Grouping by category

# One panel per numeric column. The grid size is derived from the list, so
# adding or removing a column needs no other change; each panel is 6 inches
# wide (18x6 overall for the three columns above). squeeze=False keeps `axes`
# an array even when only one column is listed.
panel_count = len(numerical_cols)
fig, axes = plt.subplots(1, panel_count, figsize=(6 * panel_count, 6), squeeze=False)
axes = axes.ravel()

# Create a boxplot for each numerical feature
for i, col in enumerate(numerical_cols):
    sns.boxplot(data=df, x=col, y=grouping_col, hue=grouping_col, palette='Set3', ax=axes[i], dodge=False)
    axes[i].set_title(f"Box Plot of {col.capitalize()} by Category", fontsize=14)
    axes[i].set_xlabel(col.capitalize(), fontsize=12)
    axes[i].set_ylabel("Category", fontsize=12)

# Adjust layout to ensure no overlapping labels
plt.tight_layout()
plt.show()

In [None]:
# One-Hot Encoding Example
import category_encoders as ce

# Small illustrative sample. It is kept in its own variable so that running the
# example does not replace whatever `df` currently holds.
country_sample = pd.DataFrame({
    'Country': ['India', 'United States', 'United States', 'United States', 'India']
})
# Convert the 'Country' column to categorical dtype
country_sample['Country'] = country_sample['Country'].astype('category')

# One indicator column per distinct country; use_cat_names=True is passed so
# the new columns are named after the category values.
encoder = ce.OneHotEncoder(cols=['Country'], use_cat_names=True)
df_encoded = encoder.fit_transform(country_sample)

# Display the result
print("One-Hot Encoded Data using category_encoders:")
print(df_encoded.head())

In [None]:
# Dummy Encoding Example
# Imported here as well so the cell does not depend on another cell having
# brought `ce` into scope.
import category_encoders as ce

# Small illustrative sample. It is kept in its own variable so that running the
# example does not replace whatever `df` currently holds.
country_sample = pd.DataFrame({
    'Country': ['India', 'United States', 'United States', 'United States', 'India']
})
# Convert the 'Country' column to categorical dtype
country_sample['Country'] = country_sample['Country'].astype('category')

# Dummy encoding = one-hot encoding with one indicator column removed.
encoder = ce.OneHotEncoder(cols=['Country'], use_cat_names=True, drop_invariant=True)
# .iloc[:, 1:] drops the first indicator column explicitly.
df_encoded = encoder.fit_transform(country_sample).iloc[:, 1:]

# Display the result
print("Dummy Encoded Data using category_encoders:")
print(df_encoded.head())

In [None]:
# Ordinal Encoding Example
import pandas as pd
import category_encoders as ce

# Illustrative sample of channel types. It is kept in its own variable so that
# running the example does not replace whatever `df` currently holds.
channel_sample = pd.DataFrame({
    'Channel_Type': [
        'Music', 'Games', 'Entertainment', 'Education', 'Entertainment',
        'Music', 'Entertainment', 'Entertainment', 'People', 'Entertainment',
        'Music', 'Sports', 'Games', 'Music', 'Music', 'Entertainment',
        'Entertainment', 'Music', 'Entertainment', 'Music', 'Music',
        'Entertainment', 'Education', 'Music', 'Education', 'Music',
        'Entertainment', 'Music', 'Sports', 'Film', 'Music', 'Music',
        'Music', 'News'
    ]
})

# Ordinal Encoding for the 'Channel_Type' column using category_encoders
encoder = ce.OrdinalEncoder(cols=['Channel_Type'])
df_encoded = encoder.fit_transform(channel_sample)

# Original and encoded values side by side. Both inputs carry a column named
# 'Channel_Type', so the two columns are relabelled after concatenation.
df_combined = pd.concat([channel_sample, df_encoded], axis=1)
df_combined.columns = ['Channel_Type (Original)', 'Channel_Type (Encoded)']

# Display the result
print("Original Data with Ordinal Encoding:")
print(df_combined)

In [None]:
# Count Encoding Example
import pandas as pd

# Illustrative sample of channel types. It is kept in its own variable so that
# running the example does not replace whatever `df` currently holds.
channel_sample = pd.DataFrame({
    'Channel_Type': [
        'Music', 'Games', 'Entertainment', 'Education', 'Entertainment',
        'Music', 'Entertainment', 'Entertainment', 'People', 'Entertainment',
        'Music', 'Sports', 'Games', 'Music', 'Music', 'Entertainment',
        'Entertainment', 'Music', 'Entertainment', 'Music', 'Music',
        'Entertainment', 'Education', 'Music', 'Education', 'Music',
        'Entertainment', 'Music', 'Sports', 'Film', 'Music', 'Music',
        'Music', 'News'
    ]
})

# Category -> number of rows in which it occurs.
count_encoding = channel_sample['Channel_Type'].value_counts().to_dict()
# Replace each category by its occurrence count in a new column.
channel_sample['Channel_Type_Count'] = channel_sample['Channel_Type'].map(count_encoding)

# Display the original category next to its count encoding.
print("Data with Count Encoding:")
print(channel_sample[['Channel_Type', 'Channel_Type_Count']])

In [None]:
# Binary Encoding Example
import pandas as pd
import category_encoders as ce

# Illustrative sample of channel types. It is kept in its own variable so that
# running the example does not replace whatever `df` currently holds.
channel_sample = pd.DataFrame({
    'Channel_Type': [
        'Music', 'Games', 'Entertainment', 'Education', 'Entertainment',
        'Music', 'Entertainment', 'Entertainment', 'People', 'Entertainment',
        'Music', 'Sports', 'Games', 'Music', 'Music', 'Entertainment',
        'Entertainment', 'Music', 'Entertainment', 'Music', 'Music',
        'Entertainment', 'Education', 'Music', 'Education', 'Music',
        'Entertainment', 'Music', 'Sports', 'Film', 'Music', 'Music',
        'Music', 'News'
    ]
})

# Binary Encoding for the 'Channel_Type' column
encoder = ce.BinaryEncoder(cols=['Channel_Type'])
df_encoded = encoder.fit_transform(channel_sample)

# Original column followed by the encoder's output columns.
df_combined = pd.concat([channel_sample, df_encoded], axis=1)

# Display the result
print("Original Data with Binary Encoding:")
print(df_combined.head())

In [None]:
# Reload the dataset from disk so the modelling cells below start from the
# contents of the CSV file.
import os
import pandas as pd

# The CSV location can be overridden through the YOUTUBE_STATS_CSV environment
# variable; when the variable is unset the original path is used unchanged.
file_path = os.environ.get('YOUTUBE_STATS_CSV', r"E:\ML\global youtube statistics.csv")
df = pd.read_csv(file_path, encoding='ISO-8859-1')
# Display the first few rows of the dataset
df.head()

In [None]:
# Label Encoding for KNN Classification
from sklearn.preprocessing import LabelEncoder

# Text columns that are replaced by integer codes.
categorical_columns = ['category', 'Country', 'Abbreviation', 'channel_type', 'Youtuber','Title']
target_column = 'Validity'

# One fitted encoder per column, kept in a dict keyed by column name.
# Re-fitting a single shared encoder inside the loop would keep only the last
# column's mapping, making it impossible to decode the other columns later
# (e.g. label_encoders['Country'].inverse_transform(...)).
label_encoders = {}
for column in categorical_columns:
    label_encoder = LabelEncoder()
    df[column] = label_encoder.fit_transform(df[column])
    label_encoders[column] = label_encoder

# NOTE(review): nothing in this cell fills missing values in the freshly loaded
# frame; KNN cannot be fitted on NaN — confirm the CSV has none in the feature
# columns.

# Separate features and target
X = df.drop(columns=[target_column])
y = df[target_column]
# Check the first few rows to ensure everything is numeric, including 'Youtuber'
print(X.head())

In [None]:
# Split data for KNN Classification
from sklearn.model_selection import train_test_split

# X holds the independent variables (features), y the dependent variable (target).
#   test_size=0.2   - 80% of the rows for training, 20% for testing
#   random_state=42 - fixed seed, so the split is the same on every run
#   stratify=y      - class proportions of y are kept equal in both parts
split_options = {'test_size': 0.2, 'random_state': 42, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# KNN Classifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

# With a single neighbour, every point is given the class of its closest
# training point. That makes predictions sensitive to noise: one noisy or
# outlying neighbour decides the outcome on its own.
neighbor_count = 1  # You can adjust the number of neighbors
knn = KNeighborsClassifier(n_neighbors=neighbor_count)

# Fitting stores the training data; it is searched for the nearest
# neighbour(s) at prediction time.
# NOTE(review): KNN is distance-based and the features are passed in unscaled —
# consider whether standardising them is needed.
knn.fit(X_train, y_train)

In [None]:
# Get model score
# For classification estimators, score() reports accuracy: the fraction of
# test rows whose predicted label equals the true label in y_test.
knn.score(X_test, y_test)

In [None]:
# Make predictions
# Predict a label for every row of the held-out test features; y_pred is what
# the evaluation cells that follow compare against y_test.
y_pred = knn.predict(X_test)

In [None]:
# Confusion matrix
# Rows are the actual classes and columns the predicted classes (the layout the
# labelled DataFrame in the following cell spells out).
confusion_matrix(y_test, y_pred)
# Counts noted from an earlier run. NOTE(review): they depend on the data and on
# the train/test split, so re-check them against the matrix displayed above.
# True Positives (TP): 25 - positive-class rows the model predicted as positive.
# True Negatives (TN): 44 - negative-class rows the model predicted as negative.
# False Positives (FP): 16 - negative-class rows the model predicted as positive.
# False Negatives (FN): 15 - positive-class rows the model predicted as negative.
# NOTE(review): presumably "positive" means a valid channel (Validity == 1) — confirm.

In [None]:
# Create confusion matrix as DataFrame
# Same matrix as a labelled table: one row per actual class, one column per
# predicted class.
cm_values = confusion_matrix(y_test, y_pred)
predicted_labels = ['Predicted Not Valid', 'Predicted Validity']
actual_labels = ['Actually Not Valid', 'Actually Valid']
pd.DataFrame(cm_values, index=actual_labels, columns=predicted_labels)

In [None]:
# Classification report
# Text summary built from the true test labels (y_test) and the labels the
# model predicted for them (y_pred).
report_text = classification_report(y_test, y_pred)
print(report_text)

In [None]:
# Find optimal K value
# Each candidate K is scored with 5-fold cross-validation on the training data
# only. Choosing K by its error on the test set would tune the model to that
# set and make the final test score optimistic; keeping the test set out of the
# search leaves it as an honest final check.
from sklearn.model_selection import cross_val_score

error_rate = []  # error_rate[k - 1] is the mean validation error for K = k
for i in range(1, 40):
    candidate = KNeighborsClassifier(n_neighbors=i)
    fold_accuracy = cross_val_score(candidate, X_train, y_train, cv=5, scoring='accuracy')
    # Error rate = 1 - accuracy, averaged over the folds.
    error_rate.append(1 - fold_accuracy.mean())

In [None]:
# Plot error rate vs K value
# error_rate holds one entry per K, starting at K = 1, so the x-axis is derived
# from its length instead of repeating the search range by hand.
k_values = range(1, len(error_rate) + 1)
plt.figure(figsize=(10, 6))
plt.plot(k_values, error_rate, color='blue', linestyle='dashed', marker='o', markerfacecolor='red', markersize=10)
plt.title('Error Rate vs. K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')
# Render the figure explicitly, as the other plotting cells do; without it the
# cell output also echoes the repr of the last label object.
plt.show()

In [None]:
# Train with optimal K value
# NOTE(review): K=14 is presumably the value read off the error-rate plot —
# confirm it against the current curve.
knn = KNeighborsClassifier(n_neighbors=14)
knn.fit(X_train, y_train)
pred = knn.predict(X_test)
# Report on the predictions of THIS model (pred). y_pred still holds the output
# of the earlier K=1 model, so using it here would show stale metrics that
# disagree with the accuracy printed below.
print(classification_report(y_test, pred))
print("Accuracy: ", knn.score(X_test, y_test))