In [187]:
import numpy as np
import pandas as pd

# Consider the Pittsburgh bridge dataset, which consists of 108 instances and 13 attributes. Perform the tasks listed below:

### Q.1 Copy and save the dataset to your local drive under the name bridge.xlsx.

### Q.2 Provide a header for the DataFrame using the given values:
```
['IDENTIF','RIVER', 'LOCATION', 'ERECTED', 'PURPOSE', 'LENGTH', 'LANES', 
  'CLEAR-G', 'T-OR-D', 'MATERIAL', 'SPAN', 'REL-L', 'TYPE']
```

**Solving Q.1 and Q.2 in one go.**

In [188]:
# Read the headerless CSV; header=None keeps pandas from consuming the first record as column names.
raw_csv_path = "Pittsburgh bridge dataset.csv"
df = pd.read_csv(raw_csv_path, header=None)

In [189]:
# Q.1: save an Excel copy of the raw dataset as bridge.xlsx.
# The frame was loaded with header=None, so at this point its column labels
# are pandas-generated integers rather than part of the data. header=False
# keeps those placeholder labels from being written as a bogus first row,
# and index=False likewise omits the row index, so the workbook contains
# only the dataset's own records.
df.to_excel("bridge.xlsx", index=False, header=False)

In [190]:
# Q.2: label the columns with the names prescribed by the exercise,
# in the same left-to-right order as the columns of the loaded file.
headers = [
    'IDENTIF',
    'RIVER',
    'LOCATION',
    'ERECTED',
    'PURPOSE',
    'LENGTH',
    'LANES',
    'CLEAR-G',
    'T-OR-D',
    'MATERIAL',
    'SPAN',
    'REL-L',
    'TYPE',
]
df.columns = headers

In [191]:
# Preview the first rows, then report the (rows, columns) shape on its own line.
print(df.head(), df.shape, sep="\n")

  IDENTIF RIVER LOCATION ERECTED   PURPOSE  LENGTH LANES CLEAR-G   T-OR-D  \
0      E1     M        3  CRAFTS   HIGHWAY       ?     2       N  THROUGH   
1      E2     A       25  CRAFTS   HIGHWAY  MEDIUM     2       N  THROUGH   
2      E3     A       39  CRAFTS  AQUEDUCT       ?     1       N  THROUGH   
3      E5     A       29  CRAFTS   HIGHWAY  MEDIUM     2       N  THROUGH   
4      E6     M       23  CRAFTS   HIGHWAY       ?     2       N  THROUGH   

  MATERIAL   SPAN REL-L  TYPE  
0     WOOD  SHORT     S  WOOD  
1     WOOD  SHORT     S  WOOD  
2     WOOD      ?     S  WOOD  
3     WOOD  SHORT     S  WOOD  
4     WOOD      ?     S  WOOD  
(108, 13)


### Q.3 List the names of the column(s) along with their count of missing values. Here, missing values are coded with '?'.

In [192]:
# The dataset marks unknown entries with '?'; convert that placeholder to NaN
# so that pandas' missing-value machinery recognises it.
df.replace(to_replace='?', value=np.nan, inplace=True)

# Per-column tally of missing entries.
missing_count = df.isnull().sum(axis=0)
print(missing_count)

IDENTIF      0
RIVER        0
LOCATION     1
ERECTED      0
PURPOSE      0
LENGTH      27
LANES       16
CLEAR-G      2
T-OR-D       6
MATERIAL     2
SPAN        16
REL-L        5
TYPE         2
dtype: int64


### Q.4 Find the column name(s) ending with any of the following letters: 'N', 'H', 'S'. If the number of recorded (non-missing) values in any of these columns is less than 100, drop that column.

In [193]:
# Keep only the columns whose label ends in N, H or S ('$' anchors the match
# to the end of the label), then collect their names.
nhs_suffix_frame = df.filter(regex='[NHS]$', axis=1)
cols_to_check = nhs_suffix_frame.columns
print(cols_to_check.tolist())

['LOCATION', 'LENGTH', 'LANES', 'SPAN']


In [194]:
# Tally the recorded (non-null) entries of every candidate column in one pass.
non_null_counts = df[cols_to_check].notna().sum()

for col, non_null_count in non_null_counts.items():
    print(f"Checking column '{col}': {non_null_count} non-null values.")

# Columns with fewer than 100 recorded values are marked for removal.
cols_to_drop = [col for col, count in non_null_counts.items() if count < 100]

Checking column 'LOCATION': 107 non-null values.
Checking column 'LENGTH': 81 non-null values.
Checking column 'LANES': 92 non-null values.
Checking column 'SPAN': 92 non-null values.


In [195]:
# Remove the flagged columns from the frame, or report that nothing qualified.
if not cols_to_drop:
    print("\nNo columns met the drop criteria.")
else:
    df.drop(labels=cols_to_drop, axis=1, inplace=True)
    print(f"\nDropped columns: {cols_to_drop}")


Dropped columns: ['LENGTH', 'LANES', 'SPAN']


In [196]:
# List the column labels currently present in the DataFrame.
print(df.columns)

Index(['IDENTIF', 'RIVER', 'LOCATION', 'ERECTED', 'PURPOSE', 'CLEAR-G',
       'T-OR-D', 'MATERIAL', 'REL-L', 'TYPE'],
      dtype='object')


### Q.5 Drop all row(s) that have 2 or more missing values. What is the shape of the final DataFrame?

In [197]:
# Report the (rows, columns) shape before any rows are removed, for later comparison.
print(df.shape)

(108, 10)


In [198]:
# Rows with 2 or more missing cells must go, i.e. a row may lack at most one
# value. dropna's `thresh` is the minimum number of non-null cells a row needs
# in order to be kept, hence "number of columns minus one".
max_missing_allowed = 1
threshold = df.shape[1] - max_missing_allowed
df.dropna(axis=0, thresh=threshold, inplace=True)

In [199]:
# Report the (rows, columns) shape of the DataFrame in its current state.
print(df.shape)

(102, 10)


### Q.6 Fill all remaining missing values with the mode of the corresponding column.

In [200]:
# DataFrame.mode() may return several rows when a column has tied modes;
# its first row supplies exactly one mode per column.
mode_table = df.mode()
modes = mode_table.iloc[0]

In [201]:
# Impute in place: `modes` is matched to the columns by label, so each
# missing cell receives the value stored for its own column.
df.fillna(value=modes, inplace=True)

In [202]:
# Preview the first rows of the DataFrame in its current state.
print(df.head())

  IDENTIF RIVER LOCATION ERECTED   PURPOSE CLEAR-G   T-OR-D MATERIAL REL-L  \
0      E1     M        3  CRAFTS   HIGHWAY       N  THROUGH     WOOD     S   
1      E2     A       25  CRAFTS   HIGHWAY       N  THROUGH     WOOD     S   
2      E3     A       39  CRAFTS  AQUEDUCT       N  THROUGH     WOOD     S   
3      E5     A       29  CRAFTS   HIGHWAY       N  THROUGH     WOOD     S   
4      E6     M       23  CRAFTS   HIGHWAY       N  THROUGH     WOOD     S   

   TYPE  
0  WOOD  
1  WOOD  
2  WOOD  
3  WOOD  
4  WOOD  
