In [14]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages

# Input CSV (BRFSS 2015 diabetes health indicators) and the Excel workbook that
# collects every summary table produced below.
file_path = 'D:/Rajesh/Rajesh/Personal/AISanDiego/ProjectDocuments/DiabetesHealthIndicators/GitHubClone/Diabetes-Health-Indicator/data/diabetes_012_health_indicators_BRFSS2015.csv'
output_path = 'D:/Rajesh/Rajesh/Personal/AISanDiego/ProjectDocuments/DiabetesHealthIndicators/summary_statistics.xlsx'

# Echo the input location before reading so a bad path is easy to spot.
print("file_path = ", file_path)

# Read the CSV and strip stray leading/trailing whitespace from the header names.
diabetes_data = pd.read_csv(file_path).rename(columns=str.strip)

# Descriptive statistics (count, mean, std, quartiles, ...) for the dataset.
summary_stats = diabetes_data.describe()

# Writing by path (not through an ExcelWriter in append mode) creates the
# workbook, or overwrites an existing one, with this single sheet.
summary_stats.to_excel(excel_writer=output_path, sheet_name='Summary_Statistics')

# Build a side-by-side table of value counts for every categorical column.
# A column is assumed to be categorical when it has <= 10 unique values.
#
# The per-column counts are collected first and joined in a single pd.concat so
# that the row index is the UNION of all category values. Assigning each Series
# into an initially empty DataFrame would fix the row index to the categories of
# the first column and silently drop counts for any value not present in it
# (e.g. a 1..5 scale assigned after a 0/1/2 column would lose 3, 4 and 5).
categorical_counts = {
    column: diabetes_data[column].value_counts()
    for column in diabetes_data.columns
    if diabetes_data[column].nunique() <= 10
}

if categorical_counts:
    # Rows: category values (sorted); columns: feature names.
    # NaN marks a value that a given feature never takes.
    categorical_summary = pd.concat(categorical_counts, axis=1).sort_index()
else:
    # pd.concat raises on an empty mapping; keep an empty table instead.
    categorical_summary = pd.DataFrame()

# Save value counts for categorical columns to the same Excel file
# (append mode adds a sheet to the workbook created for the summary statistics).
with pd.ExcelWriter(output_path, mode='a', engine='openpyxl') as writer:
    categorical_summary.to_excel(writer, sheet_name='Categorical_Summary')

# Create a DataFrame to store all relationship statistics
# (one group of rows per feature, describing that feature within each 'Diabetes_012' group).
all_relationship_stats = pd.DataFrame()

for column in diabetes_data.columns:
    if column != 'Diabetes_012':  # the target is the grouping key, so it is not summarized against itself
        if diabetes_data[column].nunique() > 10:  # Numerical column (more than 10 distinct values)
            # describe() per target group, then unstack() to flatten the table.
            # NOTE(review): the result should be a Series keyed by (statistic, Diabetes_012 value) pairs — confirm.
            stats = diabetes_data.groupby('Diabetes_012')[column].describe().unstack()
            # Prefix each label with the feature name so rows from different features stay distinguishable.
            stats.index = [f'{column}_{idx}' for idx in stats.index]
        else:  # Categorical column (10 or fewer distinct values)
            # Count of each category value within each target group;
            # combinations that never occur are filled with 0.
            stats = diabetes_data.groupby('Diabetes_012')[column].value_counts().unstack().fillna(0)
            # Rows are labelled '<feature>_<Diabetes_012 value>'.
            stats.index = [f'{column}_{idx}' for idx in stats.index]
        # NOTE(review): the numerical branch appears to produce a Series while the categorical
        # branch produces a DataFrame, so the stacked result mixes two layouts — verify that the
        # exported sheet reads as intended.
        # Concatenating inside the loop rebuilds the accumulated frame on every iteration.
        all_relationship_stats = pd.concat([all_relationship_stats, stats], axis=0)

# Save all relationship statistics to the same Excel file
# (append mode adds the sheet to the existing workbook at output_path).
with pd.ExcelWriter(output_path, mode='a', engine='openpyxl') as writer:
    all_relationship_stats.to_excel(writer, sheet_name='All_Relationships')

# Specify the path to save the PDF file
pdf_output_path = 'D:/Rajesh/Rajesh/Personal/AISanDiego/ProjectDocuments/DiabetesHealthIndicators/plots.pdf'

# Columns with at most this many distinct values are drawn as categorical
# (count plots); everything else is treated as numerical.
max_categorical_levels = 10

# Create a PDF file to save the plots; each pdf.savefig() call adds one page.
with PdfPages(pdf_output_path) as pdf:

    # ---- Page 1: distribution of every column --------------------------------
    num_cols = 2
    # True ceiling division, so a partially filled last row is still allocated
    # for any num_cols; at least one row so plt.subplots never receives nrows=0.
    num_rows = max(1, -(-len(diabetes_data.columns) // num_cols))

    # squeeze=False keeps `axes` two-dimensional even when the grid has a single
    # row, so the same flat iteration works for any number of columns.
    fig, axes = plt.subplots(nrows=num_rows, ncols=num_cols, figsize=(16, num_rows * 4), squeeze=False)
    fig.suptitle('Distribution of Health Indicators')

    for ax, column in zip(axes.flat, diabetes_data.columns):
        try:
            if diabetes_data[column].nunique() <= max_categorical_levels:
                # countplot draws bars at ordinal positions 0..n-1 and labels them
                # with the category values itself. Overriding the ticks with the
                # raw values would shift the ticks away from the bars whenever
                # the codes are not exactly 0..n-1, so the ticks are left alone.
                sns.countplot(data=diabetes_data, x=column, ax=ax)
            else:
                sns.histplot(data=diabetes_data, x=column, kde=True, ax=ax)
            ax.set_title(column)
        except Exception as e:
            # Best effort: one column that cannot be plotted must not abort the page.
            print(f"Error plotting {column}: {e}")

    # Hide grid cells that received no plot instead of leaving empty frames.
    for ax in axes.flat[len(diabetes_data.columns):]:
        ax.set_visible(False)

    plt.tight_layout(rect=[0, 0, 1, 0.96])  # leave room at the top for the suptitle
    pdf.savefig(fig)  # Save the figure to the PDF
    plt.close(fig)

    # ---- Page 2: every indicator compared across diabetes status -------------
    # Box plots for numerical indicators, grouped count plots for categorical ones.
    fig, axes = plt.subplots(nrows=num_rows, ncols=num_cols, figsize=(16, num_rows * 4), squeeze=False)
    fig.suptitle('Comparison of Health Indicators by Diabetes Status')

    columns = diabetes_data.columns
    columns = columns.drop('Diabetes_012')  # Exclude the target column

    for ax, column in zip(axes.flat, columns):
        if diabetes_data[column].nunique() > max_categorical_levels:  # numerical
            sns.boxplot(data=diabetes_data, x='Diabetes_012', y=column, ax=ax)
        else:  # categorical
            sns.countplot(data=diabetes_data, x=column, hue='Diabetes_012', palette='Set2', ax=ax)

        ax.set_title(column)

    # The target column is excluded here, so the grid has at least one spare cell.
    for ax in axes.flat[len(columns):]:
        ax.set_visible(False)

    plt.tight_layout(rect=[0, 0, 1, 0.96])
    pdf.savefig(fig)  # Save the figure to the PDF
    plt.close(fig)

    # ---- Pages 3-4: correlation and covariance heatmaps ----------------------
    # Both matrices stay bound at module level; they are exported to Excel later.
    correlation_matrix = diabetes_data.corr()
    covariance_matrix = diabetes_data.cov()

    for matrix, title in (
        (correlation_matrix, 'Correlation Matrix of Diabetes Attributes'),
        (covariance_matrix, 'Covariance Matrix of Diabetes Attributes'),
    ):
        fig, ax = plt.subplots(figsize=(12, 8))
        sns.heatmap(matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
        ax.set_title(title)
        pdf.savefig(fig)  # Save the figure to the PDF
        plt.close(fig)

# Append the correlation and covariance matrices to the existing workbook,
# one sheet per matrix (dict order fixes the sheet order).
matrix_sheets = {
    'Correlation_Matrix': correlation_matrix,
    'Covariance_Matrix': covariance_matrix,
}
with pd.ExcelWriter(output_path, mode='a', engine='openpyxl') as writer:
    for sheet_title, matrix in matrix_sheets.items():
        matrix.to_excel(writer, sheet_name=sheet_title)


file_path =  D:/Rajesh/Rajesh/Personal/AISanDiego/ProjectDocuments/DiabetesHealthIndicators/GitHubClone/Diabetes-Health-Indicator/data/diabetes_012_health_indicators_BRFSS2015.csv


In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Read the BRFSS 2015 indicators CSV and strip whitespace from the header names.
file_path = 'D:/Rajesh/Rajesh/Personal/AISanDiego/ProjectDocuments/DiabetesHealthIndicators/GitHubClone/Diabetes-Health-Indicator/data/diabetes_012_health_indicators_BRFSS2015.csv'
diabetes_data = pd.read_csv(file_path).rename(columns=str.strip)

# Target vector first, then the feature matrix (every column except the target).
y = diabetes_data['Diabetes_012']
X = diabetes_data.drop(columns=['Diabetes_012'])

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits so no test information leaks into the fit.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit a logistic regression (library defaults) on the scaled training features.
model = LogisticRegression().fit(X_train_scaled, y_train)

# Class predictions for the held-out rows.
y_pred = model.predict(X_test_scaled)

# Evaluate the model
# Per-class precision/recall/F1 and the confusion matrix use the hard predictions.
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

# ROC-AUC is a ranking metric and must be computed from scores, not from the
# predicted labels: with hard labels a binary AUC is degenerate, and a
# multiclass target makes roc_auc_score raise.
# NOTE(review): the target name 'Diabetes_012' suggests three classes (0/1/2) — confirm.
y_proba = model.predict_proba(X_test_scaled)
if y_proba.shape[1] == 2:
    # Binary target: score with the probability of the positive (second) class.
    roc_auc = roc_auc_score(y_test, y_proba[:, 1])
else:
    # Multiclass target: one-vs-rest AUC, macro-averaged over classes.
    # `labels` ties the probability columns to the classes the model learned.
    roc_auc = roc_auc_score(y_test, y_proba, multi_class='ovr', labels=model.classes_)
print('ROC-AUC Score:', roc_auc)

ModuleNotFoundError: No module named 'sklearn'