In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Name of the label column in the survey export.
TARGET_COLUMN = 'RISK'

# Load the CSV file (path is relative to the notebook's working directory)
data = pd.read_csv('Earcare.csv')

# Display the first few rows of the dataset to understand its structure
print("Dataset preview:")
print(data.head())

# Step 1: Drop rows that have no label.
# Imputing the label with a column mean/mode would invent training targets
# (and a mean-filled numeric label is not even a valid class).
data = data.dropna(subset=[TARGET_COLUMN]).reset_index(drop=True)

# Step 2: Separate features and target variable
X = data.drop(TARGET_COLUMN, axis=1)
y = data[TARGET_COLUMN]

# Column groups are computed on the features only, so the label is never
# imputed or treated as a feature.
numerical_columns = X.select_dtypes(include=['float64', 'int64']).columns
categorical_columns = X.select_dtypes(include=['object']).columns

label_encoders = {}

# A text label is still encoded to integers; its encoder is kept under the
# label's column name so predictions can be mapped back.
if y.dtype == object:
    target_encoder = LabelEncoder()
    y = pd.Series(target_encoder.fit_transform(y), index=y.index, name=TARGET_COLUMN)
    label_encoders[TARGET_COLUMN] = target_encoder

# Step 3: Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Work on explicit copies so the column assignments below never write through
# to `X` (which stays as the raw, un-imputed feature frame).
X_train = X_train.copy()
X_test = X_test.copy()

# Step 4: Handle missing values using statistics from the training split only,
# so nothing about the test rows leaks into preprocessing.
for column in numerical_columns:
    train_mean = X_train[column].mean()
    X_train[column] = X_train[column].fillna(train_mean)
    X_test[column] = X_test[column].fillna(train_mean)

# Step 5: Fill and encode categorical features.
# Looping per column (instead of one frame-wide `.mode().iloc[0]`) also means
# a dataset without any text columns is simply a no-op here.
for column in categorical_columns:
    train_mode = X_train[column].mode().iloc[0]
    X_train[column] = X_train[column].fillna(train_mode)
    X_test[column] = X_test[column].fillna(train_mode)

    le = LabelEncoder()
    # The category -> integer mapping is fitted on every observed value so
    # that transforming the test split cannot hit an unseen label.
    le.fit(pd.concat([X_train[column], X_test[column]]))
    X_train[column] = le.transform(X_train[column])
    X_test[column] = le.transform(X_test[column])
    label_encoders[column] = le  # Save encoder for inverse transformation if needed

# Display shapes of resulting datasets
print(f"Training features shape: {X_train.shape}")
print(f"Testing features shape: {X_test.shape}")
print(f"Training labels shape: {y_train.shape}")
print(f"Testing labels shape: {y_test.shape}")
# Preview only; printing the full training frame floods the cell output.
print(X_train.head())


In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
# Categorical-aware SMOTE variants from the same package as SMOTE above.
from imblearn.over_sampling import SMOTEN, SMOTENC

# Name of the label column and the number of rows written to the output CSV.
TARGET_COLUMN = 'RISK'
N_OUTPUT_ROWS = 1000

data = pd.read_csv('Earcare.csv')

# Rows without a label are dropped rather than imputed: a mean/mode-filled
# label would fabricate training targets.
data = data.dropna(subset=[TARGET_COLUMN]).reset_index(drop=True)

X = data.drop(TARGET_COLUMN, axis=1)
y = data[TARGET_COLUMN]

# Column groups are computed on the features only.
numerical_columns = X.select_dtypes(include=['float64', 'int64']).columns
categorical_columns = X.select_dtypes(include=['object']).columns

# Handle missing values
for column in numerical_columns:
    X[column] = X[column].fillna(X[column].mean())

# Fill and encode categorical variables (one encoder per column so every
# mapping can be inverted later).
label_encoders = {}
for column in categorical_columns:
    X[column] = X[column].fillna(X[column].mode().iloc[0])
    le = LabelEncoder()
    X[column] = le.fit_transform(X[column])
    label_encoders[column] = le

# A text label is encoded too, so the exported CSV stays fully numeric.
if y.dtype == object:
    target_encoder = LabelEncoder()
    y = pd.Series(target_encoder.fit_transform(y), index=y.index, name=TARGET_COLUMN)
    label_encoders[TARGET_COLUMN] = target_encoder

# Plain SMOTE interpolates between neighbours, which turns label-encoded
# categories into meaningless fractional codes. Pick the variant that matches
# the feature types.
# NOTE(review): answers already stored as integer codes in the CSV look
# numeric to pandas and are still interpolated by SMOTE - confirm whether
# those columns should be listed as categorical as well.
categorical_positions = [X.columns.get_loc(column) for column in categorical_columns]
if not categorical_positions:
    smote = SMOTE(sampling_strategy='auto', random_state=42, k_neighbors=4)
elif len(categorical_positions) == X.shape[1]:
    smote = SMOTEN(sampling_strategy='auto', random_state=42, k_neighbors=4)
else:
    smote = SMOTENC(
        categorical_features=categorical_positions,
        sampling_strategy='auto',
        random_state=42,
        k_neighbors=4,
    )
X_synthetic, y_synthetic = smote.fit_resample(X, y)

synthetic_data = pd.DataFrame(X_synthetic, columns=X.columns)
synthetic_data[TARGET_COLUMN] = y_synthetic

# Sample with replacement only when the resampled set is too small to supply
# N_OUTPUT_ROWS distinct rows. In that case the output necessarily contains
# exact duplicates, which will leak across any later train/test split.
synthetic_data = synthetic_data.sample(
    n=N_OUTPUT_ROWS,
    replace=len(synthetic_data) < N_OUTPUT_ROWS,
    random_state=42,
)
synthetic_data = synthetic_data.reset_index(drop=True)
synthetic_data.to_csv('synthetic_earcare_data.csv', index=False)
# The exported rows are a mix of original and SMOTE-generated samples.
print(f"Generated {len(synthetic_data)} rows of synthetic data")


Generated 1000 rows of synthetic data




In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Name of the label column in the survey export.
TARGET_COLUMN = 'RISK'

# Load the CSV file
data = pd.read_csv('Earcare.csv')

# Rows without a label are dropped instead of mean-imputed: a mean-filled
# label is not a valid class.
data = data.dropna(subset=[TARGET_COLUMN]).reset_index(drop=True)

# Separate features and target variable
X = data.drop(TARGET_COLUMN, axis=1)
y = data[TARGET_COLUMN]

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Explicit copies so the imputation below does not write through to `X`.
X_train = X_train.copy()
X_test = X_test.copy()

# Handle missing values with means taken from the training split only, so
# the evaluation rows never influence preprocessing.
numerical_columns = X.select_dtypes(include=['float64', 'int64']).columns
for column in numerical_columns:
    train_mean = X_train[column].mean()
    X_train[column] = X_train[column].fillna(train_mean)
    X_test[column] = X_test[column].fillna(train_mean)

# Create and train the Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf_classifier.predict(X_test)

# Evaluate the model
# NOTE(review): a perfect held-out score is unusual. Worth checking
# Earcare.csv for duplicated rows (`data.duplicated().sum()`) or a feature
# that directly encodes the label before trusting these numbers.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Display the results
print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)

# Feature importance (impurity-based, as exposed by the fitted forest)
feature_importance = pd.DataFrame({'feature': X.columns, 'importance': rf_classifier.feature_importances_})
feature_importance = feature_importance.sort_values('importance', ascending=False)
print("\nTop 10 Most Important Features:")
print(feature_importance.head(10))


Accuracy: 1.0
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        94
           1       1.00      1.00      1.00       106

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200


Top 10 Most Important Features:
                                              feature  importance
7   How often do you take breaks while using headp...    0.205922
10  While Using Headphones/earphones are you able ...    0.166321
4   What activities do you usually use headphones ...    0.111099
14  Do you take any precautions to protect your he...    0.083601
5      At what volume level do you typically listen?     0.073467
6   Do you increase the volume when in noisy envir...    0.071524
8   Do you use any volume-limiting features on you...    0.057354
1                                              Gender    0.046197
3   On av

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Name of the label column in the survey export.
TARGET_COLUMN = 'RISK'

# Load and preprocess the data
data = pd.read_csv('Earcare.csv')

# Rows without a label are dropped instead of mean-imputed: a mean-filled
# label is not a valid class.
data = data.dropna(subset=[TARGET_COLUMN]).reset_index(drop=True)

# Separate features and target variable
X = data.drop(TARGET_COLUMN, axis=1)
y = data[TARGET_COLUMN]

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Explicit copies so the imputation below does not write through to `X`.
X_train = X_train.copy()
X_test = X_test.copy()

# Handle missing values with means taken from the training split only, so
# the evaluation rows never influence preprocessing.
numerical_columns = X.select_dtypes(include=['float64', 'int64']).columns
for column in numerical_columns:
    train_mean = X_train[column].mean()
    X_train[column] = X_train[column].fillna(train_mean)
    X_test[column] = X_test[column].fillna(train_mean)

# Create and train the XGBoost Classifier
xgb_classifier = XGBClassifier(random_state=42)
xgb_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = xgb_classifier.predict(X_test)

# Evaluate the model
# NOTE(review): a perfect held-out score is unusual. Worth checking
# Earcare.csv for duplicated rows (`data.duplicated().sum()`) or a feature
# that directly encodes the label before trusting these numbers.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Display the results
print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)

# Feature importance as exposed by the fitted booster
feature_importance = pd.DataFrame({'feature': X.columns, 'importance': xgb_classifier.feature_importances_})
feature_importance = feature_importance.sort_values('importance', ascending=False)
print("\nTop 10 Most Important Features:")
print(feature_importance.head(10))


Accuracy: 1.0
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        94
           1       1.00      1.00      1.00       106

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200


Top 10 Most Important Features:
                                              feature  importance
10  While Using Headphones/earphones are you able ...    0.428018
7   How often do you take breaks while using headp...    0.242474
14  Do you take any precautions to protect your he...    0.167438
4   What activities do you usually use headphones ...    0.130577
13  Do you experience any of the following after u...    0.012794
6   Do you increase the volume when in noisy envir...    0.011168
1                                              Gender    0.005059
5      At what volume level do you typically listen?     0.002380
9   Do yo

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

# Imputer that can live inside the pipeline (same library as the other steps).
from sklearn.impute import SimpleImputer

# Name of the label column in the survey export.
TARGET_COLUMN = 'RISK'

# Load the data
data = pd.read_csv('Earcare.csv')

# Rows without a label are dropped rather than mode-filled: an imputed label
# would fabricate training targets.
data = data.dropna(subset=[TARGET_COLUMN]).reset_index(drop=True)

# Separate features and target
X = data.drop(TARGET_COLUMN, axis=1)
y = data[TARGET_COLUMN]

# Encode categorical variables with one encoder per column, so each mapping
# can still be inverted afterwards. Missing entries are left as NaN here and
# filled inside the pipeline instead.
label_encoders = {}
for column in X.select_dtypes(include=['object']).columns:
    observed = X[column].notna()
    encoder = LabelEncoder()
    codes = pd.Series(float('nan'), index=X.index, dtype='float64')
    codes.loc[observed] = encoder.fit_transform(X.loc[observed, column])
    X[column] = codes
    label_encoders[column] = encoder

# A text label is encoded to integers as well.
if y.dtype == object:
    target_encoder = LabelEncoder()
    y = pd.Series(target_encoder.fit_transform(y), index=y.index, name=TARGET_COLUMN)
    label_encoders[TARGET_COLUMN] = target_encoder

# Create a pipeline with preprocessing and SVM classifier.
# Imputation is a pipeline step so that, in every fold, the fill values are
# learned from that fold's training rows only (no leakage from held-out rows).
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('scaler', StandardScaler()),
    ('svm', SVC(kernel='rbf', random_state=42))
])

# Perform 5-fold cross-validation
cv_scores = cross_val_score(pipeline, X, y, cv=5, scoring='accuracy')

# Output the cross-validation results
# NOTE(review): perfect scores in every fold are unusual. Worth checking
# Earcare.csv for duplicated rows (`data.duplicated().sum()`) or a feature
# that directly encodes the label before trusting these numbers.
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean accuracy: {cv_scores.mean():.2f} (+/- {cv_scores.std() * 2:.2f})")


Cross-validation scores: [1. 1. 1. 1. 1.]
Mean accuracy: 1.00 (+/- 0.00)
