## Code For Data **Cleaning**

In [18]:
import pandas as pd
from google.colab import files

# Upload the file
uploaded = files.upload()

# Get the original file name
file_name = list(uploaded.keys())[0]

# Read the CSV file
data = pd.read_csv(file_name)

# Replace empty values in 'Test Temp (°C)' column with 25°C
data['Test Temp (°C)'].fillna(25, inplace=True)

# Replace missing phases with 'Austenite + Ferrite'
data['Phases'].fillna('Austenite + Ferrite', inplace=True)

# Replace missing strain rates with 0.001
data['Strain Rate (s⁻¹)'].fillna(0.001, inplace=True)

# Define columns to replace with averages
columns_to_average = ['YS (MPa)', 'UTS (MPa)', 'Hardness (HV)', 'Dislocation Density (m⁻²)', 'Strain Hardening Exponent (n)', 'Strain Hardening Coefficient (K, MPa)', 'Grain Size (µm)', 'Elongation (%)']

# Calculate averages for each column
column_averages = {}
for column in columns_to_average:
    column_averages[column] = data[column].mean()

# Round the averages as specified
rounded_averages = {
    'YS (MPa)': round(column_averages['YS (MPa)'], 0),
    'UTS (MPa)': round(column_averages['UTS (MPa)'], 0),
    'Hardness (HV)': round(column_averages['Hardness (HV)'], 0),
    'Dislocation Density (m⁻²)': round(column_averages['Dislocation Density (m⁻²)'], 2),
    'Strain Hardening Exponent (n)': round(column_averages['Strain Hardening Exponent (n)'], 2),
    'Strain Hardening Coefficient (K, MPa)': round(column_averages['Strain Hardening Coefficient (K, MPa)'], 0),
    'Grain Size (µm)': round(column_averages['Grain Size (µm)'], 1),
    'Elongation (%)': round(column_averages['Elongation (%)'], 1)
}

# Replace missing values with rounded averages
for column, average in rounded_averages.items():
    data[column].fillna(average, inplace=True)

# Generate the cleaned file name
cleaned_file_name = f"{file_name.split('.')[0]}_cleaned.csv"

# Save the cleaned DataFrame to a new CSV file
data.to_csv(cleaned_file_name, index=False)

# Download the cleaned CSV file
files.download(cleaned_file_name)


Saving Raw_Data_Claude.csv to Raw_Data_Claude.csv


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Test Temp (°C)'].fillna(25, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Phases'].fillna('Austenite + Ferrite', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are s

TypeError: Could not convert string '580610565590600595575605560585590615555595600570590605580590-595565585620575600570590585550625595580545595630560598557618582603548602568588540635572592553608585598558600566593578' to numeric

# Code for **mean, median, std deviation, min** and **max** for all numeric fields

In [21]:
import pandas as pd
from google.colab import files

# Define the file names
file_names = [
    'Raw_Data_ChatGPT_cleaned.csv',
    'Raw_Data_Claude_cleaned.csv',
    'Raw_Data_Gemini_cleaned.csv',
    'Raw_Data_Perplexity_cleaned.csv'
]

# Loop through each file
for file_name in file_names:
    try:
        # Load the cleaned data file
        data = pd.read_csv(file_name)

        # Select numeric fields
        numeric_data = data.select_dtypes(include=['float64', 'int64'])

        # Calculate statistics
        statistics = {
            'mean': numeric_data.mean(),
            'median': numeric_data.median(),
            'std_dev': numeric_data.std(),
            'min': numeric_data.min(),
            'max': numeric_data.max()
        }

        # Convert statistics to a DataFrame for better readability
        stats_df = pd.DataFrame(statistics)

        # Display the statistics for the current file
        print(f"Statistics for {file_name}:")
        print(stats_df)
        print("\n")

    except FileNotFoundError:
        print(f"File {file_name} not found.")


Statistics for Raw_Data_ChatGPT_cleaned.csv:
                                               mean        median  \
Test Temp (°C)                         1.933333e+02  2.000000e+02   
Grain Size (µm)                        8.768333e+00  8.750000e+00   
Dislocation Density (m⁻²)              1.156604e+14  1.156604e+14   
YS (MPa)                               4.650000e+02  4.680000e+02   
UTS (MPa)                              7.202167e+02  7.200000e+02   
Hardness (HV)                          2.848500e+02  2.850000e+02   
Elongation (%)                         2.117667e+01  2.130000e+01   
Strain Rate (s⁻¹)                      7.666667e-04  1.000000e-03   
Strain Hardening Exponent (n)          2.058333e-01  2.100000e-01   
Strain Hardening Coefficient (K, MPa)  1.101217e+03  1.101000e+03   

                                            std_dev           min  \
Test Temp (°C)                         1.145058e+02  2.500000e+01   
Grain Size (µm)                        1.569896e+00  6.20

# Visualisation Using **Differnt Plots**

Pair Plot

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Define the file names
file_names = [
    'Raw_Data_ChatGPT_cleaned.csv',
    'Raw_Data_Claude_cleaned.csv',
    'Raw_Data_Gemini_cleaned.csv',
    'Raw_Data_Perplexity_cleaned.csv'
]

# Loop through each file to create pair plots
for file_name in file_names:
    try:
        # Load the cleaned data file
        data = pd.read_csv(file_name)

        # Select numeric fields for the pair plot
        numeric_data = data.select_dtypes(include=['float64', 'int64'])

        # Create a pair plot
        sns.set(style="whitegrid")
        pair_plot = sns.pairplot(numeric_data, diag_kind="hist", height=2.5, aspect=1)

        # Set title for the plot
        plt.suptitle(f"Pair Plot of Key Properties for {file_name}", y=1.02, fontsize=16)

        # Save the plot as an image file
        plot_file_name = f"{file_name.split('.')[0]}_pairplot.png"
        plt.savefig(plot_file_name, bbox_inches='tight')

        # Show the plot
        plt.show()

        print(f"Pair plot saved as {plot_file_name}.")

    except FileNotFoundError:
        print(f"File {file_name} not found.")


Comparision Plot between differnet properties

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Define the file names
file_names = [
    'Raw_Data_ChatGPT_cleaned.csv',
    'Raw_Data_Claude_cleaned.csv',
    'Raw_Data_Gemini_cleaned.csv',
    'Raw_Data_Perplexity_cleaned.csv'
]

# Define the X vs Y combinations for plotting
plots = [
    ('Strain Rate (s⁻¹)', 'Grain Size (µm)'),
    ('Grain Size (µm)', 'YS (MPa)'),
    ('Grain Size (µm)', 'Elongation (%)'),
    ('Hardness (HV)', 'Grain Size (µm)')
]

# Loop through each file
for file_name in file_names:
    try:
        # Load the cleaned data file
        data = pd.read_csv(file_name)

        # Loop through each X vs Y combination
        for x, y in plots:
            # Create a sorted copy of the data
            data_sorted = data.copy()
            data_sorted.sort_values(by=x, inplace=True)

            plt.figure(figsize=(8, 6))
            sns.lineplot(data=data_sorted, x=x, y=y)
            plt.title(f'{x} vs {y} for {file_name.split(".")[0]}', fontsize=14)
            plt.xlabel(x, fontsize=12)
            plt.ylabel(y, fontsize=12)
            plt.grid(True)
            plt.tight_layout()

            # Save each plot as an image file
            plot_file_name = f"{file_name.split('.')[0]}_{x.replace(' ', '_').replace('/', '_')}_vs_{y.replace(' ', '_').replace('/', '_')}.png"
            plt.savefig(plot_file_name)

            # Display the plot
            plt.show()

        print(f"Plots for {file_name} saved successfully!")

    except FileNotFoundError:
        print(f"File {file_name} not found.")


Heatmap Plot Code

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np # Import numpy and assign it to the alias 'np'

# Load the datasets
file_names = [
    'Raw_Data_ChatGPT_cleaned.csv',
    'Raw_Data_Claude_cleaned.csv',
    'Raw_Data_Gemini_cleaned.csv',
    'Raw_Data_Perplexity_cleaned.csv'
]

dataframes = [pd.read_csv(file_name) for file_name in file_names]

# Example for comparing mean values of 'YS (MPa)'
ys_means = [df['YS (MPa)'].mean() for df in dataframes]
print("Mean YS (MPa) for each dataset:", ys_means)

# Example for comparing correlation matrices
for i, df in enumerate(dataframes):
    # Select only numeric columns for correlation calculation
    numeric_df = df.select_dtypes(include=['number'])
    corr_matrix = numeric_df.corr()
    print(f"Correlation Matrix for Dataset {i + 1}:")
    print(corr_matrix)
    plt.figure(figsize=(8, 6))
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
    plt.title(f"Correlation Matrix for Dataset {i + 1}")
    plt.show()

# Stack correlation matrices
# Select only numeric columns for correlation calculation
corr_matrices = [df.select_dtypes(include=['number']).corr().values for df in dataframes]
stacked_corr = np.stack(corr_matrices) # Use np.stack


# Calculate average correlation matrix
avg_corr_matrix = np.mean(stacked_corr, axis=0)

plt.figure(figsize=(8, 6))
sns.heatmap(avg_corr_matrix, annot=True, cmap='coolwarm')
plt.title("Average Correlation Across Datasets")
plt.show()

# Example for creating a summary table
summary_table = []
for df in dataframes:
    summary_row = {
        'Dataset': df.columns[0],
        'Mean YS (MPa)': df['YS (MPa)'].mean(),
        'Median YS (MPa)': df['YS (MPa)'].median(),
        'Std YS (MPa)': df['YS (MPa)'].std()
    }
    summary_table.append(summary_row)

summary_df = pd.DataFrame(summary_table)
print(summary_df)


Comparision between Original and AI generated data

In [None]:
# Step 1: Import required libraries
import pandas as pd
import matplotlib.pyplot as plt
from google.colab import files

# Load the datasets
file_names = [
    'Raw_Data_ChatGPT_cleaned.csv',
    'Raw_Data_Claude_cleaned.csv',
    'Raw_Data_Gemini_cleaned.csv',
    'Raw_Data_Perplexity_cleaned.csv'
]

dataframes = [pd.read_csv(file_name) for file_name in file_names]

# Step 3: Load the uploaded CSVs
df_chatgpt = pd.read_csv('Raw_Data_ChatGPT_cleaned.csv')
df_claude = pd.read_csv('Raw_Data_Claude_cleaned.csv')
df_gemini = pd.read_csv('Raw_Data_Gemini_cleaned.csv')
df_perplexity = pd.read_csv('Raw_Data_Perplexity_cleaned.csv')
df_manual = pd.read_csv('ManualData.csv')  # Make sure your manual data is named exactly this

# Step 4: Define common properties to compare
properties = ['YS (MPa)', 'UTS (MPa)', 'Hardness (HV)', 'Elongation (%)', 'Strain Rate (s⁻¹)']

# Step 5: Create subplots for each property
plt.figure(figsize=(18, 10))

for i, prop in enumerate(properties, 1):
    plt.subplot(2, 3, i)

    plt.plot(df_manual[prop], label='Manual', marker='o')
    plt.plot(df_chatgpt[prop], label='ChatGPT', marker='x')
    plt.plot(df_claude[prop], label='Claude', marker='s')
    plt.plot(df_gemini[prop], label='Gemini', marker='^')
    plt.plot(df_perplexity[prop], label='Perplexity', marker='*')

    plt.title(prop)
    plt.xlabel('Sample Index')
    plt.ylabel(prop)
    plt.grid(True)
    plt.legend()

plt.tight_layout()
plt.show()


Comparision between **mean, median, std, min, max** of Original and AI generated Data

In [None]:
import pandas as pd
from google.colab import files

# Load the datasets
file_names = [
    'Raw_Data_ChatGPT_cleaned.csv',
    'Raw_Data_Claude_cleaned.csv',
    'Raw_Data_Gemini_cleaned.csv',
    'Raw_Data_Perplexity_cleaned.csv'
]

dataframes = [pd.read_csv(file_name) for file_name in file_names]


# Load datasets
df_chatgpt = pd.read_csv('Raw_Data_ChatGPT_cleaned.csv')
df_claude = pd.read_csv('Raw_Data_Claude_cleaned.csv')
df_gemini = pd.read_csv('Raw_Data_Gemini_cleaned.csv')
df_perplexity = pd.read_csv('Raw_Data_Perplexity_cleaned.csv')
df_manual = pd.read_csv('ManualData.csv')

# Define properties to analyze
properties = ['YS (MPa)', 'UTS (MPa)', 'Hardness (HV)', 'Elongation (%)', 'Strain Rate (s⁻¹)']
datasets = {
    'Manual': df_manual,
    'ChatGPT': df_chatgpt,
    'Claude': df_claude,
    'Gemini': df_gemini,
    'Perplexity': df_perplexity
}

# Calculate and store stats
stats_to_plot = ['mean', 'median', 'std', 'min', 'max']
stat_results = {stat: pd.DataFrame(columns=datasets.keys(), index=properties) for stat in stats_to_plot}

for name, df in datasets.items():
    for prop in properties:
        # Convert to numeric, handling errors
        try:
            df[prop] = pd.to_numeric(df[prop], errors='coerce')
        except ValueError:
            print(f"Warning: Non-numeric data found in column '{prop}' of dataset '{name}'. Replacing with NaN.")
            df[prop] = pd.to_numeric(df[prop], errors='coerce')  # Force conversion to numeric, invalid values become NaN

        stat_results['mean'].loc[prop, name] = df[prop].mean()
        stat_results['median'].loc[prop, name] = df[prop].median()
        stat_results['std'].loc[prop, name] = df[prop].std()
        stat_results['min'].loc[prop, name] = df[prop].min()
        stat_results['max'].loc[prop, name] = df[prop].max()

# ... (rest of your code) ...
# Plotting
plt.figure(figsize=(20, 20))
for i, stat in enumerate(stats_to_plot, 1):
    plt.subplot(3, 2, i)
    stat_results[stat].astype(float).plot(kind='bar', ax=plt.gca())
    plt.title(f'{stat.capitalize()} Comparison')
    plt.ylabel(stat.capitalize())
    plt.xticks(rotation=45)
    plt.grid(True)

plt.tight_layout()
plt.show()