In [1]:
import pandas as pd

def infer_and_convert_data_types(df):
    for col in df.columns:
        # Attempt to convert to numeric first
        df_converted = pd.to_numeric(df[col], errors='coerce')
        if not df_converted.isna().all():  # If at least one value is numeric
            df[col] = df_converted
            continue

        # Attempt to convert to datetime
        try:
            df[col] = pd.to_datetime(df[col])
            continue
        except (ValueError, TypeError):
            pass

        # Check if the column should be categorical
        if len(df[col].unique()) / len(df[col]) < 0.5:  # Example threshold for categorization
            df[col] = pd.Categorical(df[col])

    return df

# Test the function with your DataFrame
df = pd.read_csv('sample_data.csv')
print("Data types before inference:")
print(df.dtypes)

Data types before inference:
Name         object
Birthdate    object
Score        object
Grade        object
dtype: object


In [13]:
import pandas as pd

def infer_and_convert_dtypes(file_path):
    # Load the data
    if file_path.endswith('.csv'):
        df = pd.read_csv(file_path)
    elif file_path.endswith('.xlsx'):
        df = pd.read_excel(file_path)
    else:
        raise ValueError("Unsupported file format. Please provide a CSV or Excel file.")
    
    # Infer and convert column data types
    for column in df.columns:
        col_data = df[column]
        
        # Attempt date conversion
        try:
            df[column] = pd.to_datetime(col_data, errors='coerce')
            if df[column].notna().sum() > 0:
                continue
        except (ValueError, TypeError):
            pass

        # Attempt integer conversion if no missing values and values are close to integers
        if col_data.dropna().apply(lambda x: x.isnumeric() if isinstance(x, str) else False).all():
            df[column] = pd.to_numeric(col_data, downcast='integer', errors='coerce')
            continue
        
        # Attempt float conversion
        try:
            df[column] = pd.to_numeric(col_data, errors='coerce')
            if df[column].notna().sum() > 0:
                continue
        except ValueError:
            pass

        # Attempt Boolean conversion
        if col_data.dropna().isin([True, False, 'True', 'False']).all():
            df[column] = col_data.astype('bool')
            continue

        # Convert text columns with limited unique values to category
        if col_data.nunique() < 0.5 * len(col_data) and col_data.dtype == 'object':
            df[column] = col_data.astype('category')
    
    return df

df_converted = infer_and_convert_dtypes('sample_data.csv')
print(df_converted.dtypes)

Name                float64
Birthdate    datetime64[ns]
Score               float64
Grade              category
dtype: object


  df[column] = pd.to_datetime(col_data, errors='coerce')
  df[column] = pd.to_datetime(col_data, errors='coerce')
  df[column] = pd.to_datetime(col_data, errors='coerce')
