In [None]:
import pandas as pd

# Read the spreadsheet into a DataFrame
data = pd.read_excel("btp_list.xlsx")

# Show the leading rows, followed by the dtype pandas inferred for every column
print(data.head(), data.dtypes, sep="\n")


In [None]:
import pandas as pd
import numpy as np

# Load the spreadsheet again. The loop further down in this cell overwrites
# columns of `data`, so reloading here means every run of the cell starts
# from the values stored in the file rather than from already-converted ones.
data = pd.read_excel("btp_list.xlsx")

# Function to process a string of numbers separated by commas
def process_number_list(cell):
    """Collapse a comma-separated string of numbers into its standard deviation.

    Args:
        cell: A single cell value. Strings are parsed as comma-separated
            numbers; whitespace around each number is ignored, so both
            "1,2,3" and "1, 2, 3" are accepted. Any non-string value is
            returned unchanged.

    Returns:
        ``np.std`` of the parsed numbers for string input, the original
        value for non-string input, or ``np.nan`` (after printing a
        diagnostic) when any entry cannot be converted to ``float``.
    """
    # Non-string cells are passed through untouched.
    if not isinstance(cell, str):
        return cell
    try:
        # Split on the bare comma: float() already ignores surrounding
        # whitespace, so the spacing after each comma does not matter.
        numbers = [float(num) for num in cell.split(",")]
    except ValueError as e:
        # Keep the notebook running, but make the bad cell visible.
        print(f"Error processing cell: {cell} with error {e}")
        return np.nan  # Return NaN for problematic conversions
    # Spread of the values (standard deviation), not their average.
    return np.std(numbers)

# Columns whose cells may hold comma-separated number strings
columns = ["shoulder_midpoints", "head_turn_angles", "left_hand", "right_hand"]

# Run every listed column through process_number_list and swap the converted
# columns into the frame in a single step
data = data.assign(
    **{column: data[column].apply(process_number_list) for column in columns}
)

# Preview the converted frame
print(data.head())


In [None]:
# Pairwise correlations between the four feature columns
correlation_matrix = data.loc[
    :, ['shoulder_midpoints', 'head_turn_angles', 'left_hand', 'right_hand']
].corr()

# Show the matrix under its heading
print("Correlation Matrix:", correlation_matrix, sep="\n")


In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Features that feed the PCA
feature_columns = ['shoulder_midpoints', 'head_turn_angles', 'left_hand', 'right_hand']

# Drop rows with any NaN values in the feature columns
cleaned_data = data.dropna(subset=feature_columns)

# Standardizing the features
x = cleaned_data[feature_columns].values
x = StandardScaler().fit_transform(x)

# Applying PCA
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(data=principalComponents, columns=['Principal Component 1', 'Principal Component 2'])

# Combining with other data for a comprehensive view.
# principalDf carries a fresh 0..n-1 index, while cleaned_data keeps the row
# labels that survived dropna. pd.concat(axis=1) aligns on index labels, so the
# metadata index is reset first; otherwise components would be paired with the
# wrong rows and NaN-filled rows would appear whenever a row had been dropped.
finalDf = pd.concat(
    [principalDf, cleaned_data[['youtube_video_code', 'category']].reset_index(drop=True)],
    axis=1,
)

# Print results
print(finalDf.head())


In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Fill gaps in the four feature columns with the column mean
# (strategy may also be 'median' or 'most_frequent')
imputer = SimpleImputer(strategy='mean')
imputed_data = imputer.fit_transform(
    data[['shoulder_midpoints', 'head_turn_angles', 'left_hand', 'right_hand']]
)

# Standardize the imputed features
x = StandardScaler().fit_transform(imputed_data)

# Project onto the first two principal components
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(
    principalComponents,
    columns=['Principal Component 1', 'Principal Component 2'],
)

# Attach the identifying columns, re-indexed from zero, next to the components
finalDf = pd.concat(
    [principalDf, data[['youtube_video_code', 'category']].reset_index(drop=True)],
    axis="columns",
)

# Preview the combined frame
print(finalDf.head())


In [None]:
import matplotlib.pyplot as plt

# Scatter of the two principal components, drawn through the explicit Axes interface
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(finalDf['Principal Component 1'], finalDf['Principal Component 2'])
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.set_title('PCA Result Plot')
ax.grid(True)
plt.show()
