# In [None]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest

# Load the dataset
data = pd.read_excel("/content/19CSE305_LabData_Set3.1.xlsx", sheet_name="thyroid0387_UCI")

# Label every attribute with a cardinality heuristic: more than 10 distinct
# values -> "Ordinal", otherwise "Nominal".
# NOTE(review): cardinality alone cannot show that an ordering exists --
# confirm these labels against the dataset description before relying on them.
data_types = {
    column: "Ordinal" if len(data[column].unique()) > 10 else "Nominal"
    for column in data.columns
}

# Print data types
for column, data_type in data_types.items():
    print("Attribute:", column)
    print("Datatype:", data_type)

# Suggest an encoding per attribute from the label assigned above
data_encodings = {
    column: "Employ Label encoding" if data_types[column] == "Ordinal" else "One-hot encoding"
    for column in data.columns
}

# Print data encodings
for column, encoding_type in data_encodings.items():
    print("Attribute:", column)
    print("Encoding Type:", encoding_type)
    print()

# Analyze numeric columns (snapshot of the raw, uncleaned data)
numeric_columns = data.select_dtypes(include=['number'])

# Calculate data range for numeric variables
data_ranges = numeric_columns.apply(lambda x: x.max() - x.min())
print("Data Range for Numeric Variables:")
print(data_ranges)

# Check for missing values in each column. The cleaning step further down
# treats '?' as a missing-value placeholder, so those cells are counted
# together with real NaN cells; isna() alone would report them as present.
missing_values = data.isna().sum() + data.eq('?').sum()
print("Missing Values per Column:")
print(missing_values)

# Detect outliers using Z-score.
# nan_policy='omit' keeps a single NaN age from turning every score into NaN.
# With the population standard deviation |z| can never exceed sqrt(n - 1), so
# the previous cut-off of 100 could not flag anything unless the sheet had
# more than 10,000 rows; 3 is the conventional cut-off.
age_values = data['age'].to_numpy(dtype=float)
z_scores = np.abs(stats.zscore(age_values, nan_policy='omit'))
threshold = 3
outliers = np.where(z_scores > threshold)
print("Outliers Detected:", outliers)

# Calculate mean and standard deviation for numeric variables
mean_values = numeric_columns.mean()
std_deviation = numeric_columns.std()
print("Mean of Numeric Values:", mean_values)
print("Standard Deviation of Numeric Values:", std_deviation)

# Handle missing values by replacing '?' with appropriate values.
# Results are assigned back to `data[column]`: a chained
# `data[column].replace(..., inplace=True)` acts on a temporary object and
# does not update `data` when pandas copy-on-write is active.
columns_with_missing_values = ['TSH', 'T3', 'T4U']
for column in columns_with_missing_values:
    is_placeholder = data[column].eq('?')
    as_numeric = pd.to_numeric(data[column], errors='coerce')
    # Cells that are neither NaN nor '?' yet still fail to parse as numbers
    # mean the column is genuinely categorical.
    unparsable = as_numeric.isna() & data[column].notna() & ~is_placeholder
    if not unparsable.any():
        # Numeric measurement: store as numbers and impute with the mean of
        # the observed values.
        data[column] = as_numeric.fillna(as_numeric.mean())
    else:
        # Categorical: impute with the most frequent real category, never
        # with the '?' placeholder itself.
        observed_modes = data.loc[~is_placeholder, column].mode()
        if not observed_modes.empty:
            data[column] = data[column].where(~is_placeholder, observed_modes.iloc[0])

# Min-Max scaling for numeric columns.
# The numeric selection is refreshed so that the columns cleaned above are
# scaled too, and so the cleaned values are not overwritten by results
# computed from the pre-cleaning snapshot.
min_range = 0
max_range = 1
numeric_columns = data.select_dtypes(include=['number'])
min_values = numeric_columns.min()
max_values = numeric_columns.max()
# A constant column has zero span; dividing by 1 instead maps it to min_range
# rather than producing NaN from 0 / 0.
value_spans = (max_values - min_values).replace(0, 1)
min_max_scaled = (numeric_columns - min_values) / value_spans * (max_range - min_range) + min_range
data[numeric_columns.columns] = min_max_scaled
# Calculate Jaccard and Simple Matching coefficients between two vectors
def jaccard_coefficient(vector1, vector2):
    """Return the Jaccard similarity of the distinct values in two vectors.

    Both inputs are reduced to the set of values they contain, so order and
    repetition are ignored. The result is ``|A & B| / |A | B|``.

    Args:
        vector1: Iterable of hashable values.
        vector2: Iterable of hashable values.

    Returns:
        A float in ``[0, 1]``. Two empty inputs are treated as identical and
        give ``1.0`` instead of raising ``ZeroDivisionError``.
    """
    values1 = set(vector1)
    values2 = set(vector2)
    union = values1 | values2
    if not union:
        # Nothing on either side: define the similarity as 1 rather than
        # dividing by zero.
        return 1.0
    return len(values1 & values2) / len(union)

def simple_matching_coefficient(vector1, vector2):
    """Return the simple matching coefficient (SMC) of two vectors.

    The SMC is the share of positions at which both vectors hold the same
    value: ``matching positions / total positions``. Unlike the Jaccard
    coefficient, positions where both vectors agree on an "absent" value
    (for example 0 and 0) count as matches.

    Args:
        vector1: Iterable of values; compared position by position.
        vector2: Iterable of values with the same length as ``vector1``.

    Returns:
        A float in ``[0, 1]``. Two empty vectors are treated as identical
        and give ``1.0``.

    Raises:
        ValueError: If the two vectors differ in length.
    """
    values1 = list(vector1)
    values2 = list(vector2)
    if len(values1) != len(values2):
        raise ValueError(
            "simple_matching_coefficient requires vectors of equal length "
            f"(got {len(values1)} and {len(values2)})"
        )
    if not values1:
        # No attributes to disagree on; avoid dividing by zero.
        return 1.0
    # Note: NaN never equals NaN, so a position that is NaN in both vectors
    # counts as a mismatch.
    matches = sum(1 for left, right in zip(values1, values2) if left == right)
    return matches / len(values1)

# Imports needed by the similarity and plotting steps below. `plt` and `sns`
# are used for the heatmaps and are not imported in the header of this file.
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity

# Load the dataset for similarity calculation (a fresh copy read from disk)
df_similarity = pd.read_excel("/content/19CSE305_LabData_Set3.1.xlsx", sheet_name="thyroid0387_UCI")
vector1 = df_similarity.iloc[0]
vector2 = df_similarity.iloc[1]
jaccard_coeff = jaccard_coefficient(vector1, vector2)
simple_matching_coeff = simple_matching_coefficient(vector1, vector2)

print("Jaccard Coefficient:", jaccard_coeff)
print("Simple Matching Coefficient:", simple_matching_coeff)

# Calculate Cosine Similarity between the first two observations.
# Cosine similarity is only defined on numbers, so the comparison is limited
# to numeric columns that are present in both rows. The two operands are the
# first and the second row; slicing `iloc[1:2]` for both would compare the
# second row with itself and always give 1.
numeric_pair = (
    df_similarity.select_dtypes(include=['number'])
    .iloc[:2]
    .dropna(axis='columns')
)
vector1 = numeric_pair.iloc[[0]]
vector2 = numeric_pair.iloc[[1]]
cosine_sim = cosine_similarity(vector1, vector2)
print("Cosine Similarity between Vector 1 and Vector 2:", cosine_sim[0][0])

# Calculate Jaccard and Simple Matching coefficient matrices and visualize as heatmaps.
# Iterating a DataFrame directly yields its column labels, so the rows are
# extracted explicitly to compare observations with each other.
vectors = df_similarity.iloc[:20]
row_vectors = [row for _, row in vectors.iterrows()]
jc_matrix = [
    [jaccard_coefficient(row_a, row_b) for row_b in row_vectors]
    for row_a in row_vectors
]
smc_matrix = [
    [simple_matching_coefficient(row_a, row_b) for row_b in row_vectors]
    for row_a in row_vectors
]

# Plot Jaccard Coefficient Heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(jc_matrix, annot=True, fmt=".2f", cmap="YlGnBu", xticklabels=False, yticklabels=False)
plt.title("Jaccard Coefficient Heatmap")
plt.show()

# Plot Simple Matching Coefficient Heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(smc_matrix, annot=True, fmt=".2f", cmap="YlGnBu", xticklabels=False, yticklabels=False)
plt.title("Simple Matching Coefficient Heatmap")
plt.show()
