In [6]:
import pandas as pd
import numpy as np
import io
import random

def create_dataframe_with_missing_values(csv_data_string: str, seed: "int | None" = None) -> pd.DataFrame:
    """
    Parse a CSV string, drop the 'Profit' column, and blank out random cells.

    Only the columns 'R&D Spend', 'Administration', 'Marketing Spend' and
    'State' are kept (those that are present, in that order). A random number
    of cells k is then replaced with NaN, where k >= 1 and k is strictly less
    than 5% of the cells in the kept columns. When no such k exists (fewer
    than 21 cells) the frame is returned without any NaN added.

    Parameters
    ----------
    csv_data_string : str
        CSV text with a header row.
    seed : int or None, optional
        When given, a private ``random.Random(seed)`` drives both the NaN
        count and the cell selection, so the result is reproducible. When
        None (the default) the module-level ``random`` generator is used.

    Returns
    -------
    pd.DataFrame
        The kept feature columns with NaNs injected. An empty DataFrame is
        returned when the CSV text contains nothing to parse; if none of the
        four feature columns exist, the parsed frame minus 'Profit' is
        returned unchanged.

    Raises
    ------
    ValueError
        If the CSV text cannot be parsed for any reason other than being empty.
    """
    # Columns eligible for NaN injection, in the order used for the result.
    target_features = ['R&D Spend', 'Administration', 'Marketing Spend', 'State']
    profit_column = 'Profit'
    # k NaNs must satisfy k < total_cells / 20, i.e. stay strictly under 5%.
    cells_per_nan = 20

    # 1. Parse the CSV text.
    try:
        df = pd.read_csv(io.StringIO(csv_data_string))
    except pd.errors.EmptyDataError:
        # Nothing to parse: hand back an empty frame rather than failing.
        return pd.DataFrame()
    except Exception as e:
        raise ValueError(f"Error parsing CSV string: {e}") from e

    # 2. Drop the target column (if present) and keep only the known features.
    df_processed = df.drop(columns=[profit_column], errors='ignore')
    existing_features_for_nan = [col for col in target_features if col in df_processed.columns]
    if not existing_features_for_nan:
        return df_processed
    df_processed = df_processed[existing_features_for_nan]

    num_rows, num_cols = df_processed.shape
    if num_rows == 0:
        return df_processed

    # 3. Largest integer k with 20 * k < total_cells. Integer arithmetic keeps
    #    the "strictly less than 5%" bound exact without float comparisons.
    total_cells = num_rows * num_cols
    max_permissible_nans = (total_cells - 1) // cells_per_nan
    if max_permissible_nans < 1:
        return df_processed

    # 4. Draw the NaN count first, then the cells, from a single generator.
    rng = random.Random(seed) if seed is not None else random
    num_nans_to_add = rng.randint(1, max_permissible_nans)
    # Flat row-major positions: position p maps to (p // num_cols, p % num_cols).
    flat_positions = rng.sample(range(total_cells), k=num_nans_to_add)

    # 5. Apply all NaNs at once with a positional boolean mask. This does not
    #    depend on index labels and lets pandas upcast integer columns itself.
    nan_mask = np.zeros(total_cells, dtype=bool)
    nan_mask[flat_positions] = True
    return df_processed.mask(nan_mask.reshape(num_rows, num_cols))

# Inline CSV dataset: one header row followed by 50 records with the columns
# R&D Spend, Administration, Marketing Spend, State and Profit.
csv_data = """R&D Spend,Administration,Marketing Spend,State,Profit
165349.2,136897.8,471784.1,New York,192261.83
162597.7,151377.59,443898.53,California,191792.06
153441.51,101145.55,407934.54,Florida,191050.39
144372.41,118671.85,383199.62,New York,182901.99
142107.34,91391.77,366168.42,Florida,166187.94
131876.9,99814.71,362861.36,New York,156991.12
134615.46,147198.87,127716.82,California,156122.51
130298.13,145530.06,323876.68,Florida,155752.6
120542.52,148718.95,311613.29,New York,152211.77
123334.88,108679.17,304981.62,California,149759.96
101913.08,110594.11,229160.95,Florida,146121.95
100671.96,91790.61,249744.55,California,144259.4
93863.75,127320.38,249839.44,Florida,141585.52
91992.39,135495.07,252664.93,California,134307.35
119943.24,156547.42,256512.92,Florida,132602.65
114523.61,122616.84,261776.23,New York,129917.04
78013.11,121597.55,264346.06,California,126992.93
94657.16,145077.58,282574.31,New York,125370.37
91749.16,114175.79,294919.57,Florida,124266.9
86419.7,153514.11,0,New York,122776.86
76253.86,113867.3,298664.47,California,118474.03
78389.47,153773.43,299737.29,New York,111313.02
73994.56,122782.75,303319.26,Florida,110352.25
67532.53,105751.03,304768.73,Florida,108733.99
77044.01,99281.34,140574.81,New York,108552.04
64664.71,139553.16,137962.62,California,107404.34
75328.87,144135.98,134050.07,Florida,105733.54
72107.6,127864.55,353183.81,New York,105008.31
66051.52,182645.56,118148.2,Florida,103282.38
65605.48,153032.06,107138.38,New York,101004.64
61994.48,115641.28,91131.24,Florida,99937.59
61136.38,152701.92,88218.23,New York,97483.56
63408.86,129219.61,46085.25,California,97427.84
55493.95,103057.49,214634.81,Florida,96778.92
46426.07,157693.92,210797.67,California,96712.8
46014.02,85047.44,205517.64,New York,96479.51
28663.76,127056.21,201126.82,Florida,90708.19
44069.95,51283.14,197029.42,California,89949.14
20229.59,65947.93,185265.1,New York,81229.06
38558.51,82982.09,174999.3,California,81005.76
28754.33,118546.05,172795.67,California,78239.91
27892.92,84710.77,164470.71,Florida,77798.83
23640.93,96189.63,148001.11,California,71498.49
15505.73,127382.3,35534.17,New York,69758.98
22177.74,154806.14,28334.72,California,65200.33
1000.23,124153.04,1903.93,New York,64926.08
1315.46,115816.21,297114.46,Florida,49490.75
0,135426.92,0,California,42559.73
542.05,51743.15,0,New York,35673.41
0,116983.8,45173.06,California,14681.4
"""

# Build the feature frame with randomly injected missing values
modified_df = create_dataframe_with_missing_values(csv_data)

# Preview the first rows, then the per-column non-null counts
print("Modified DataFrame head:")
print(modified_df.head())
print("\nModified DataFrame info (to see NaN counts):")
modified_df.info()

# Count every missing cell across the whole frame
total_nans_in_df = modified_df.isna().sum().sum()
print(f"\nTotal NaNs introduced in the DataFrame: {total_nans_in_df}")

# Sanity check: share of missing cells relative to all cells in the frame
num_rows_final, num_cols_final = modified_df.shape  # 4 columns when every target column was present
total_cells_final = num_rows_final * num_cols_final
if total_cells_final <= 0:
    print("DataFrame is empty, no NaNs to calculate percentage for.")
else:
    percentage_nans = (total_nans_in_df / total_cells_final) * 100
    print(f"Percentage of NaNs in the (selected columns of the) DataFrame: {percentage_nans:.2f}%")

Modified DataFrame head:
   R&D Spend  Administration  Marketing Spend       State
0  165349.20       136897.80        471784.10    New York
1  162597.70       151377.59        443898.53  California
2  153441.51       101145.55        407934.54     Florida
3  144372.41       118671.85        383199.62    New York
4  142107.34        91391.77        366168.42     Florida

Modified DataFrame info (to see NaN counts):
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        46 non-null     float64
 1   Administration   49 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State            46 non-null     object 
dtypes: float64(3), object(1)
memory usage: 1.7+ KB

Total NaNs introduced in the DataFrame: 9
Percentage of NaNs in the (selected columns of the) DataFrame: 4.50%


In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [10]:
# Print the column dtypes and non-null counts of the frame with injected NaNs.
modified_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        46 non-null     float64
 1   Administration   49 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State            46 non-null     object 
dtypes: float64(3), object(1)
memory usage: 1.7+ KB


In [11]:
# Load the dataset from a CSV file expected in the current working directory.
# NOTE(review): presumably this file holds the same records as the inline
# `csv_data` string defined earlier in the notebook — confirm.
df = pd.read_csv('50_Startups.csv')

In [14]:
# Attach the target as a new 'Profits' column; this mutates modified_df in place.
# NOTE(review): the assignment aligns on index labels, so it assumes the rows of
# `df` are in the same order as the rows modified_df was built from — confirm.
modified_df['Profits'] = df['Profit']

In [15]:
# Plain assignment: ddf is a second name for the same DataFrame object as
# modified_df (no copy is made), so changes through either name affect both.
ddf = modified_df

In [18]:
# Percentage of missing values in each column.
ddf.isnull().mean()*100

R&D Spend          8.0
Administration     2.0
Marketing Spend    0.0
State              8.0
Profits            0.0
dtype: float64

In [19]:
from sklearn.model_selection import train_test_split

# Features are every column except the last one; the target is the last column.
x, y = ddf.iloc[:, :-1], ddf.iloc[:, -1]

# Hold out 15% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.15,
    random_state=42,
)

In [75]:
from sklearn.experimental import enable_iterative_imputer

from sklearn.impute import IterativeImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

In [71]:
from sklearn.pipeline import Pipeline

# Second stage: standardize the columns at positions 0-4 of its input.
# NOTE(review): no `remainder=` is passed here (unlike tnf3 below); with
# scikit-learn's documented default of remainder='drop', any column beyond
# position 4 would be discarded at this stage — confirm this is intended.
tnf2 = ColumnTransformer([
    ('trf1', StandardScaler(), slice(0, 5))
])

# Third stage: impute missing values in the columns at positions 0-4 and pass
# any remaining columns through unchanged.
tnf3 = ColumnTransformer([
    ('trf1', IterativeImputer(), slice(0, 5))
], remainder='passthrough')

# Chain the three stages in order: tnf1 -> tnf2 -> tnf3.
# NOTE(review): `tnf1` is not defined anywhere in the visible notebook. Unless
# it is defined in a cell not shown here, this raises NameError on a fresh
# kernel (Restart & Run All) — restore or add its definition.
pipes = Pipeline(steps=[
    ('tnf1', tnf1),
    ('tnf2', tnf2),
    ('tnf3', tnf3)
])

In [73]:
# Fit the preprocessing pipeline on the training features only, then reuse the
# already-fitted pipeline to transform the held-out test features.
trfd_train = pipes.fit_transform(x_train)
trfd_test = pipes.transform(x_test)

In [83]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score

# Two regressors trained on the same preprocessed training matrix.
clf = LinearRegression()
# Pin random_state so the fitted tree (and its score) is identical on every
# run; the value matches the seed already used for the train/test split.
clf2 = DecisionTreeRegressor(random_state=42)

clf.fit(trfd_train, y_train)
clf2.fit(trfd_train, y_train)

# Score both models on the held-out test set with R^2
# (linear regression first, decision tree second).
pred = clf.predict(trfd_test)
pred2 = clf2.predict(trfd_test)
print(r2_score(y_test, pred))
print(r2_score(y_test, pred2))

0.916828246835651
0.8501813438676471


In [86]:
# Mean cross-validated R^2 on the training data, one line per model
# (linear regression first, decision tree second).
for model in (clf, clf2):
    fold_scores = cross_val_score(estimator=model, X=trfd_train, y=y_train, scoring='r2')
    print(np.mean(fold_scores))

0.9182315666537786
0.8737027955370668
