In [None]:
import pandas as pd

# Read the bacteria CSV into a DataFrame (the path is resolved against the current working directory)
file_path = 'Resources/bacteria_list_200.csv'
bacteria_data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows as a quick sanity check of the parse
bacteria_data.head(5)

In [None]:
# Tally null entries column by column and print them under a heading
missing_values = bacteria_data.isna().sum()
print("Missing values per column:", missing_values, sep="\n")

# Count rows that exactly repeat an earlier row
duplicates = bacteria_data.duplicated().sum()
print(f"\nNumber of duplicate rows: {duplicates}")

In [None]:
# Keep only the first occurrence of each fully identical row; the raw frame itself is not modified
bacteria_data_cleaned = bacteria_data.drop_duplicates(keep='first')

# Report how many rows survive the de-duplication
print(f"\nNumber of rows after removing duplicates: {len(bacteria_data_cleaned)}")

In [None]:
# Display basic info about the cleaned dataset
# (.info() prints the index range, each column's non-null count and dtype, and the memory usage)
bacteria_data_cleaned.info()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Horizontal bar chart of the 15 most frequent values in the 'Family' column, most common first
fig, ax = plt.subplots(figsize=(12, 8))
top_families = bacteria_data_cleaned['Family'].value_counts().nlargest(15).index  # Top 15 families
in_top_families = bacteria_data_cleaned['Family'].isin(top_families)
sns.countplot(
    data=bacteria_data_cleaned[in_top_families],
    y='Family',
    order=top_families,  # same index as computed above, so bars follow descending frequency
    ax=ax,
)
ax.set(title='Top 15 Bacterial Families', xlabel='Count', ylabel='Family')
plt.show()

In [None]:
# Horizontal bar chart of the 15 most frequent values in the 'Where Found' column, most common first
fig, ax = plt.subplots(figsize=(12, 8))
top_locations = bacteria_data_cleaned['Where Found'].value_counts().nlargest(15).index  # Top 15 locations
in_top_locations = bacteria_data_cleaned['Where Found'].isin(top_locations)
sns.countplot(
    data=bacteria_data_cleaned[in_top_locations],
    y='Where Found',
    order=top_locations,  # same index as computed above, so bars follow descending frequency
    ax=ax,
)
ax.set(title='Top 15 Common Locations of Bacteria', xlabel='Count', ylabel='Where Found')
plt.show()

In [None]:
# Bar chart of how many rows fall under each value of 'Harmful to Humans'
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=bacteria_data_cleaned, x='Harmful to Humans', ax=ax)
ax.set(title='Harmfulness of Bacteria to Humans', xlabel='Harmful to Humans', ylabel='Count')
plt.show()

In [None]:
# Counts for the 15 most frequent families, split into one bar per 'Harmful to Humans' value
fig, ax = plt.subplots(figsize=(14, 10))
top_families = bacteria_data_cleaned['Family'].value_counts().nlargest(15).index  # Top 15 families
in_top_families = bacteria_data_cleaned['Family'].isin(top_families)
sns.countplot(
    data=bacteria_data_cleaned[in_top_families],
    y='Family',
    hue='Harmful to Humans',
    order=top_families,  # same index as computed above, so bars follow descending frequency
    ax=ax,
)
ax.set(title='Top 15 Bacterial Families vs Harmfulness', xlabel='Count', ylabel='Family')
ax.legend(title='Harmful to Humans')
plt.show()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import pandas as pd

# Encode the target variable "Harmful to Humans" as integers (1 for "Yes", 0 for "No").
#
# The column is rebuilt with .assign() and the name rebound, rather than written through
# .loc[:, col]. Writing 1/0 in place into the existing object-dtype column can leave the
# values stored as dtype=object (pandas >= 2.0 sets values in place before falling back to
# casting), and scikit-learn classifiers reject an object-dtype target as an unknown label type.
#
# The mapping also accepts 1/0, so re-running this cell on an already-encoded frame leaves
# the labels unchanged instead of turning every one of them into NaN.
target_col = 'Harmful to Humans'
target_encoded = bacteria_data_cleaned[target_col].map({'Yes': 1, 'No': 0, 1: 1, 0: 0})

# Fail loudly on missing or unexpected labels instead of letting NaN flow into the model.
unmapped = target_encoded.isna()
if unmapped.any():
    bad_labels = bacteria_data_cleaned.loc[unmapped, target_col].unique().tolist()
    raise ValueError(
        f"Unexpected values in '{target_col}': {bad_labels}; expected only 'Yes' or 'No'."
    )

# Rebinding keeps `bacteria_data_cleaned` holding the encoded target, with a true integer dtype.
bacteria_data_cleaned = bacteria_data_cleaned.assign(**{target_col: target_encoded.astype(int)})

# One-hot encode the categorical variables "Family" and "Where Found".
# drop_first=True drops one level per variable so the dummy columns are not redundant.
bacteria_data_encoded = pd.get_dummies(bacteria_data_cleaned, columns=['Family', 'Where Found'], drop_first=True)

# Separate features and target variable.
# Columns that were not one-hot encoded stay in X unchanged at this point.
X = bacteria_data_encoded.drop(columns=target_col)
y = bacteria_data_encoded[target_col]

# Split the data into training and testing sets (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Strip the 'Name' column from both feature splits; errors='ignore' makes this a no-op when the column is absent
X_train, X_test = (
    split.drop(columns=['Name'], errors='ignore')
    for split in (X_train, X_test)
)

In [None]:
# Inspect each feature column's dtype ahead of the integer cast applied to X_train/X_test in the following cell
print(X_train.dtypes)

In [None]:
# Cast every feature column in both splits to int
X_train, X_test = X_train.astype(int), X_test.astype(int)