# Import packages and Data

In [1]:
import pandas as pd
#import matplotlib
#from tabulate import tabulate

In [2]:
df = pd.read_excel(r'C:\Users\robert.everitt\OneDrive - National Grid\Data Engineering Course\Data\VI_E0309B_Test.xlsx')
df.columns = df.columns.str.strip()

In [3]:
df_importance = pd.read_excel(r'C:\Users\robert.everitt\OneDrive - National Grid\Data Engineering Course\Data\Importance_Scores.xlsx')
df_importance.columns = df_importance.columns.str.strip()

In [4]:
df_Script_Versions = pd.read_excel(r'C:\Users\robert.everitt\OneDrive - National Grid\Data Engineering Course\Data\Script_Versions.xlsx')
#df_Script_Versions = df_Script_Versions.columns.str.strip()

df_Script_Versions = pd.DataFrame(df_Script_Versions)

In [5]:
print(df_Script_Versions)

       Script ID  Script Version Number
0       RVARECAA                      3
1   RVI_BIDEFECT                      2
2     RVI_DEFECT                      7
3     RVI_GANDEF                      4
4   RVI_TCDEFECT                      1
5   RVI_TRDEFECT                      2
6      RVIABCBAA                      8
7        RVIBHAA                      7
8      RVIBUSHAA                      1
9       RVICSEAA                     10
10       RVICTAA                      8
11      RVIDISAA                      8
12      RVIEATAA                      2
13       RVIESAA                      7
14      RVIGCBAA                      7
15       RVIGZAA                      5
16      RVIMSCAA                      4
17       RVINEAA                      4
18      RVIOCBAA                      5
19       RVIQBAA                      1
20     RVIREACAA                      2
21       RVISAAA                      5
22      RVISGTAA                      8
23      RVISPIAA                      6


In [6]:
##### Check Script versions in dataset #####

# Define Functions

In [7]:
def clean_dataframe(df, search_text):
    """
    Find the first row containing 'search_text', remove rows above it, and set it as column names.
    """
    # Find the index of the first occurrence of the search text
    idx = df[df.apply(lambda row: row.astype(str).str.contains(search_text, case=False).any(), axis=1)].index
    
    if idx.empty:
        raise ValueError(f"'{search_text}' not found in the DataFrame.")
    
    # Get the first matching index
    first_match_idx = idx[0]
    
    # Update the DataFrame by removing rows above the match
    df = df.iloc[first_match_idx:].reset_index(drop=True)
    
    # Set the first row as column names and drop it from the DataFrame
    df.columns = df.iloc[0]
    df = df[1:].reset_index(drop=True)
    
    return df

In [8]:
def check_consistent_dates(df, reference_col, date_col):
    """
    Checks if a given reference number always corresponds to the same date
    in a Pandas DataFrame.

    Args:
        df (pd.DataFrame): The input DataFrame.
        reference_col (str): The name of the column containing reference numbers.
        date_col (str): The name of the column containing dates.

    Returns:
        dict: A dictionary indicating consistency.
              - If all consistent: {'consistent': True}
              - If inconsistent: {'consistent': False, 'violating_references': list_of_references}
    """
    # Ensure the date column is in datetime format for accurate comparison
    df[date_col] = pd.to_datetime(df[date_col])

    # Group by the reference column and count the number of unique dates for each reference
    date_counts = df.groupby(reference_col)[date_col].nunique()

    # Identify reference numbers that have more than one unique date
    violating_references = date_counts[date_counts > 1].index.tolist()

    if not violating_references:
        return {'consistent': True}
    else:
        return {'consistent': False, 'violating_references': violating_references}

In [9]:
def get_max_script_versions(df):
    # Get unique Script IDs
    unique_script_ids = df['Script ID'].unique()
    
    # Initialize a dictionary to store results
    max_versions = {}
    
    # Loop through each unique Script ID
    for script_id in unique_script_ids:
        # Filter the DataFrame for the current Script ID
        filtered_df = df[df['Script ID'] == script_id]
        
        # Find the maximum Script Version Number
        max_version = filtered_df['Script Version Number'].max()
        
        # Store the result in the dictionary
        max_versions[script_id] = max_version
    
    # Convert the dictionary to a DataFrame
    result_df = pd.DataFrame(list(max_versions.items()), columns=['Script ID', 'Script Version Number'])
    
    return result_df


# Main Code

In [10]:
#*Main code*

df_clean = clean_dataframe(df,'Location Code')

In [11]:
# Define column types

date_columns = ['Inspection Date']
int_columns = ['Response Score', 'Script Item Number']
string_columns = ['Location Code','Plant Number','Installed Component','Installed Modifier','Script ID', 
                  'Script Description','Attribute ID','Questions','Response Description',
                  'Script Activity ID','Script Result ID','Site','Work Group','Zone','Script Version Number']

In [12]:
# Convert column types
for col in date_columns:
    df_clean[col] = pd.to_datetime(df_clean[col], format='%d/%m/%Y', errors='coerce')

for col in int_columns:
    df_clean[col] = pd.to_numeric(df_clean[col], errors='coerce')

for col in string_columns:
    df_clean[col] = df_clean[col].astype(str, errors='ignore')

In [13]:
# Combine all columns into one list
selected_columns = date_columns + int_columns + string_columns

# Select only the specified columns
df_reduced = df_clean[selected_columns]

# Display the reduced DataFrame
df_reduced.head()

Unnamed: 0,Inspection Date,Response Score,Script Item Number,Location Code,Plant Number,Installed Component,Installed Modifier,Script ID,Script Description,Attribute ID,Questions,Response Description,Script Activity ID,Script Result ID,Site,Work Group,Zone,Script Version Number
0,2022-08-01,1.0,1,SUND4,SUND4X30VT,VT,R,RVIVTAA,VT S/S VISUAL INSPECTION,VIGEN01,Is the asset available?,YES,S-0659829,SRR-10328009,SUND,TCNWANG,TCN,1
1,2022-08-01,,2,SUND4,SUND4X30VT,VT,R,RVIVTAA,VT S/S VISUAL INSPECTION,VIGEN02,Visually inspect general condition of the asse...,,S-0659829,SRR-10328010,SUND,TCNWANG,TCN,1
2,2022-08-01,0.0,4,SUND4,SUND4X30VT,VT,R,RVIVTAA,VT S/S VISUAL INSPECTION,VIGEN03,Do you want to report any condition deteriorat...,NO,S-0659829,SRR-10328011,SUND,TCNWANG,TCN,1
3,2022-08-01,,5,SUND4,SUND4X30VT,VT,R,RVIVTAA,VT S/S VISUAL INSPECTION,VIGEN04,Please take a wide photograph of the asset,,S-0659829,SRR-10328012,SUND,TCNWANG,TCN,1
4,2022-08-01,1.0,1,SUND4,SUND4X30VT,VT,Y,RVIVTAA,VT S/S VISUAL INSPECTION,VIGEN01,Is the asset available?,YES,S-0659838,SRR-10328029,SUND,TCNWANG,TCN,1


In [14]:
result_df = get_max_script_versions(df_reduced)

#result_df.head()

In [15]:
print(type(result_df))
print(type(df_Script_Versions))


<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


In [16]:
result_df['Script Version Number'] = pd.to_numeric(result_df['Script Version Number'], errors='coerce')

In [17]:
print(df_Script_Versions)


       Script ID  Script Version Number
0       RVARECAA                      3
1   RVI_BIDEFECT                      2
2     RVI_DEFECT                      7
3     RVI_GANDEF                      4
4   RVI_TCDEFECT                      1
5   RVI_TRDEFECT                      2
6      RVIABCBAA                      8
7        RVIBHAA                      7
8      RVIBUSHAA                      1
9       RVICSEAA                     10
10       RVICTAA                      8
11      RVIDISAA                      8
12      RVIEATAA                      2
13       RVIESAA                      7
14      RVIGCBAA                      7
15       RVIGZAA                      5
16      RVIMSCAA                      4
17       RVINEAA                      4
18      RVIOCBAA                      5
19       RVIQBAA                      1
20     RVIREACAA                      2
21       RVISAAA                      5
22      RVISGTAA                      8
23      RVISPIAA                      6


In [18]:
# Merge dataframes on 'Script ID'
merged_df = pd.merge(result_df, df_Script_Versions, on='Script ID', suffixes=('_df1', '_df2'))

# Check for version discrepancies
discrepancies = merged_df[merged_df['Script Version Number_df1'] > merged_df['Script Version Number_df2']]

# Get the list of Script IDs where the condition is true
script_ids_with_issues = discrepancies['Script ID'].tolist()

# Output the result

if not script_ids_with_issues:
    print("The list of script IDs with issues is empty.")
else:
    print("The following script IDs have version issues.", script_ids_with_issues)


The list of script IDs with issues is empty.


In [19]:
print(merged_df.dtypes)


Script ID                    object
Script Version Number_df1     int64
Script Version Number_df2     int64
dtype: object


In [20]:
print(merged_df)

     Script ID  Script Version Number_df1  Script Version Number_df2
0      RVIVTAA                          1                          6
1     RVISPIAA                          5                          6
2     RVIDISAA                          1                          8
3      RVIESAA                          1                          7
4     RVICSEAA                          1                         10
5      RVICTAA                          1                          8
6    RVIBUSHAA                          1                          1
7     RVIGCBAA                          1                          7
8      RVISAAA                          1                          5
9      RVIBHAA                          1                          7
10    RVIVCTAA                          1                          4
11   RVIABCBAA                          1                          8
12     RVINEAA                          1                          4
13  RVSBUILDAA                    

In [21]:
# Create a copy of df_reduced to avoid SettingWithCopyWarning
df_reduced_copy = df_reduced.copy()

# Call the function with the copied DataFrame
result_consistent = check_consistent_dates(df_reduced_copy, 'Script Activity ID', 'Inspection Date')


In [22]:
if result_consistent['consistent']:
    print("Result: All reference numbers consistently have the same date.")
else:
    print(f"Result: The following reference numbers have inconsistent dates: {result_consistent['violating_references']}")
    print("\nRows with inconsistent dates:")
    print(df[df['reference_number'].isin(result_consistent['violating_references'])].sort_values(by='reference_number'))

print("\n" + "="*50 + "\n")

print("Columns in DataFrame:", df_reduced.columns)

Result: All reference numbers consistently have the same date.


Columns in DataFrame: Index(['Inspection Date', 'Response Score', 'Script Item Number',
       'Location Code', 'Plant Number', 'Installed Component',
       'Installed Modifier', 'Script ID', 'Script Description', 'Attribute ID',
       'Questions', 'Response Description', 'Script Activity ID',
       'Script Result ID', 'Site', 'Work Group', 'Zone',
       'Script Version Number'],
      dtype='object', name=0)


In [23]:
# Adding a new column with the first 5 characters of 'original_column'
df_reduced.loc[:,'5Char_Site_Code'] = df_reduced['Location Code'].str[:5]

df_reduced.loc[:,'Civil Item'] = df_reduced['Attribute ID'].str.contains('CIV').replace({True: 'Civil Item', False: None})

df_reduced.head

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_reduced.loc[:,'5Char_Site_Code'] = df_reduced['Location Code'].str[:5]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_reduced.loc[:,'Civil Item'] = df_reduced['Attribute ID'].str.contains('CIV').replace({True: 'Civil Item', False: None})


<bound method NDFrame.head of 0    Inspection Date  Response Score  Script Item Number Location Code  \
0         2022-08-01             1.0                   1         SUND4   
1         2022-08-01             NaN                   2         SUND4   
2         2022-08-01             0.0                   4         SUND4   
3         2022-08-01             NaN                   5         SUND4   
4         2022-08-01             1.0                   1         SUND4   
...              ...             ...                 ...           ...   
9741      2022-08-15            60.0                  32         BRIN2   
9742      2022-08-15             NaN                  33         BRIN2   
9743      2022-08-15             0.0                  34         BRIN2   
9744      2022-08-15             1.0                  36         BRIN2   
9745      2022-08-15             0.0                  37         BRIN2   

0    Plant Number Installed Component Installed Modifier Script ID  \
0      SUND

In [24]:
# Specify the columns to check
check_column = 'Civil Item'  # Replace with the name of the column to check for 'Civil Item'
target_column = 'Attribute ID'  # Replace with the name of the column to check for 'VIGEN05'

In [25]:
# Initialize a list to store the indices of matches
matches = []

In [26]:
# Iterate through the DataFrame
for i in range(len(df_reduced) - 1):  # -1 to avoid index out of range
    if df_reduced[check_column].iloc[i] == 'Civil Item':
        # Start checking the next rows for 'VIGEN05'
        j = i + 1
        while j < len(df_reduced) and df_reduced[target_column].iloc[j] == 'VIGEN05':
            matches.append(j)  # Store the index of 'VIGEN05'
            j += 1  # Move to the next row

for row in matches:
    df_reduced.at[row, check_column] = 'Civil Photo'

df_civ_photos = df_reduced[df_reduced['Civil Item'].notna()]

df_civ_photos['Attribute ID'].head

<bound method NDFrame.head of 11      VICIVCN01
13      VICIVMT01
14        VIGEN05
15        VIGEN05
16        VIGEN05
          ...    
9727    VICIVCN02
9728      VIGEN05
9734    VICIVGA09
9735    VICIVGA12
9736    VICIVGA12
Name: Attribute ID, Length: 2621, dtype: object>

In [27]:
df_scores = df_civ_photos[df_civ_photos['Civil Item'] == 'Civil Item']


df_scores.head()

Unnamed: 0,Inspection Date,Response Score,Script Item Number,Location Code,Plant Number,Installed Component,Installed Modifier,Script ID,Script Description,Attribute ID,Questions,Response Description,Script Activity ID,Script Result ID,Site,Work Group,Zone,Script Version Number,5Char_Site_Code,Civil Item
11,2022-08-01,0.0,7,SUND4,SUND4X30VT,VT,B,RVIVTAA,VT S/S VISUAL INSPECTION,VICIVCN01,Rate condition of the foundation / base,"NO SIGNIFICANT DEFECT, AS GOOD AS NEW",S-0659897,SRR-10328410,SUND,TCNWANG,TCN,1,SUND4,Civil Item
13,2022-08-01,20.0,13,SUND4,SUND4X30VT,VT,B,RVIVTAA,VT S/S VISUAL INSPECTION,VICIVMT01,Rate corrosion on the steel asset support stru...,G2-COATING INTACT (<1%)/VERY LIGHT CORROSION,S-0659897,SRR-10328412,SUND,TCNWANG,TCN,1,SUND4,Civil Item
19,2022-08-01,0.0,26,SUND4,SUND4X30VT,VT,B,RVIVTAA,VT S/S VISUAL INSPECTION,VICIVGA09,Do any of the earthing connections need reporting,NO,S-0659897,SRR-10328416,SUND,TCNWANG,TCN,1,SUND4,Civil Item
20,2022-08-01,0.0,31,SUND4,SUND4X30VT,VT,B,RVIVTAA,VT S/S VISUAL INSPECTION,VICIVGA12,Does the asset body need reporting,NO,S-0659897,SRR-10328417,SUND,TCNWANG,TCN,1,SUND4,Civil Item
21,2022-08-01,0.0,45,SUND4,SUND4X30VT,VT,B,RVIVTAA,VT S/S VISUAL INSPECTION,VICIVGA12,Does the asset body need reporting,NO,S-0659897,SRR-10328418,SUND,TCNWANG,TCN,1,SUND4,Civil Item


In [28]:
# Define the valid response scores
valid_scores = [0, 20, 40, 60, 80, 100]

# Filter the DataFrame
df_filtered = df_scores[df_scores['Response Score'].isin(valid_scores)]

df_filtered.head()


Unnamed: 0,Inspection Date,Response Score,Script Item Number,Location Code,Plant Number,Installed Component,Installed Modifier,Script ID,Script Description,Attribute ID,Questions,Response Description,Script Activity ID,Script Result ID,Site,Work Group,Zone,Script Version Number,5Char_Site_Code,Civil Item
11,2022-08-01,0.0,7,SUND4,SUND4X30VT,VT,B,RVIVTAA,VT S/S VISUAL INSPECTION,VICIVCN01,Rate condition of the foundation / base,"NO SIGNIFICANT DEFECT, AS GOOD AS NEW",S-0659897,SRR-10328410,SUND,TCNWANG,TCN,1,SUND4,Civil Item
13,2022-08-01,20.0,13,SUND4,SUND4X30VT,VT,B,RVIVTAA,VT S/S VISUAL INSPECTION,VICIVMT01,Rate corrosion on the steel asset support stru...,G2-COATING INTACT (<1%)/VERY LIGHT CORROSION,S-0659897,SRR-10328412,SUND,TCNWANG,TCN,1,SUND4,Civil Item
19,2022-08-01,0.0,26,SUND4,SUND4X30VT,VT,B,RVIVTAA,VT S/S VISUAL INSPECTION,VICIVGA09,Do any of the earthing connections need reporting,NO,S-0659897,SRR-10328416,SUND,TCNWANG,TCN,1,SUND4,Civil Item
20,2022-08-01,0.0,31,SUND4,SUND4X30VT,VT,B,RVIVTAA,VT S/S VISUAL INSPECTION,VICIVGA12,Does the asset body need reporting,NO,S-0659897,SRR-10328417,SUND,TCNWANG,TCN,1,SUND4,Civil Item
21,2022-08-01,0.0,45,SUND4,SUND4X30VT,VT,B,RVIVTAA,VT S/S VISUAL INSPECTION,VICIVGA12,Does the asset body need reporting,NO,S-0659897,SRR-10328418,SUND,TCNWANG,TCN,1,SUND4,Civil Item


In [29]:
# Further filter the DataFrame
df_final = df_filtered[~df_filtered['Response Description'].isin(['YES', 'NO', 'Non-Metallic']) & 
    df_filtered['Response Description'].notnull()
]

df_final.head()

Unnamed: 0,Inspection Date,Response Score,Script Item Number,Location Code,Plant Number,Installed Component,Installed Modifier,Script ID,Script Description,Attribute ID,Questions,Response Description,Script Activity ID,Script Result ID,Site,Work Group,Zone,Script Version Number,5Char_Site_Code,Civil Item
11,2022-08-01,0.0,7,SUND4,SUND4X30VT,VT,B,RVIVTAA,VT S/S VISUAL INSPECTION,VICIVCN01,Rate condition of the foundation / base,"NO SIGNIFICANT DEFECT, AS GOOD AS NEW",S-0659897,SRR-10328410,SUND,TCNWANG,TCN,1,SUND4,Civil Item
13,2022-08-01,20.0,13,SUND4,SUND4X30VT,VT,B,RVIVTAA,VT S/S VISUAL INSPECTION,VICIVMT01,Rate corrosion on the steel asset support stru...,G2-COATING INTACT (<1%)/VERY LIGHT CORROSION,S-0659897,SRR-10328412,SUND,TCNWANG,TCN,1,SUND4,Civil Item
31,2022-08-01,0.0,7,SUND4,SUND4X303,DIS,,RVIDISAA,DISC S/S VISUAL INSPECTION,VICIVCN01,Rate condition of the foundation / base,"NO SIGNIFICANT DEFECT, AS GOOD AS NEW",S-0659910,SRR-10328696,SUND,TCNWANG,TCN,1,SUND4,Civil Item
33,2022-08-01,0.0,11,SUND4,SUND4X303,DIS,,RVIDISAA,DISC S/S VISUAL INSPECTION,VICIVCN02,Rate condition of the concrete asset support s...,"NO SIGNIFICANT DEFECT, AS GOOD AS NEW",S-0659910,SRR-10328698,SUND,TCNWANG,TCN,1,SUND4,Civil Item
81,2022-08-02,0.0,7,SUND4,SUND4X124,DIS,,RVIDISAA,DISC S/S VISUAL INSPECTION,VICIVCN01,Rate condition of the foundation / base,"NO SIGNIFICANT DEFECT, AS GOOD AS NEW",S-0661012,SRR-10335318,SUND,TCNWANG,TCN,1,SUND4,Civil Item


Explanation:\
Negation with ~: The ~ operator is used to negate the condition, meaning we want to exclude rows where 'Response Description' is in the specified list.\
notnull() Method: The notnull() method is used to ensure that we only keep rows where 'Response Description' is not null.\
Combining Conditions: The conditions are combined using the & operator to ensure both criteria are met.\
The resulting DataFrame df_final will contain only the rows that meet both filtering criteria.

In [30]:
##import importance scores

##add them to df final based on script description and question

In [31]:
# Concatenating two columns into a new column with a space in between
df_final.loc[:,'Script and Question'] = df_final['Script Description'] + ' ' + df_final['Questions']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final.loc[:,'Script and Question'] = df_final['Script Description'] + ' ' + df_final['Questions']


In [32]:
df_importance.head()

Unnamed: 0,Script and Question,Importance score
0,ABCB S/S VISUAL INSPECTION Rate condition of t...,75.0
1,ABCB S/S VISUAL INSPECTION Rate condition of t...,82.5
2,ABCB S/S VISUAL INSPECTION Rate corrosion on t...,75.0
3,ABCB S/S VISUAL INSPECTION Rate corrosion on t...,75.0
4,B/HOUSE S/S VISUAL INSPECTION Rate condition o...,17.5


In [33]:
# Merging df_final with df_importance based on 'Script and Question'
df_final = df_final.merge(df_importance[['Script and Question', 'Importance score']], 
                          on='Script and Question', 
                          how='left')


In [34]:
df_final.head()

Unnamed: 0,Inspection Date,Response Score,Script Item Number,Location Code,Plant Number,Installed Component,Installed Modifier,Script ID,Script Description,Attribute ID,...,Script Activity ID,Script Result ID,Site,Work Group,Zone,Script Version Number,5Char_Site_Code,Civil Item,Script and Question,Importance score
0,2022-08-01,0.0,7,SUND4,SUND4X30VT,VT,B,RVIVTAA,VT S/S VISUAL INSPECTION,VICIVCN01,...,S-0659897,SRR-10328410,SUND,TCNWANG,TCN,1,SUND4,Civil Item,VT S/S VISUAL INSPECTION Rate condition of the...,82.5
1,2022-08-01,20.0,13,SUND4,SUND4X30VT,VT,B,RVIVTAA,VT S/S VISUAL INSPECTION,VICIVMT01,...,S-0659897,SRR-10328412,SUND,TCNWANG,TCN,1,SUND4,Civil Item,VT S/S VISUAL INSPECTION Rate corrosion on the...,75.0
2,2022-08-01,0.0,7,SUND4,SUND4X303,DIS,,RVIDISAA,DISC S/S VISUAL INSPECTION,VICIVCN01,...,S-0659910,SRR-10328696,SUND,TCNWANG,TCN,1,SUND4,Civil Item,DISC S/S VISUAL INSPECTION Rate condition of t...,85.0
3,2022-08-01,0.0,11,SUND4,SUND4X303,DIS,,RVIDISAA,DISC S/S VISUAL INSPECTION,VICIVCN02,...,S-0659910,SRR-10328698,SUND,TCNWANG,TCN,1,SUND4,Civil Item,DISC S/S VISUAL INSPECTION Rate condition of t...,75.0
4,2022-08-02,0.0,7,SUND4,SUND4X124,DIS,,RVIDISAA,DISC S/S VISUAL INSPECTION,VICIVCN01,...,S-0661012,SRR-10335318,SUND,TCNWANG,TCN,1,SUND4,Civil Item,DISC S/S VISUAL INSPECTION Rate condition of t...,85.0


In [35]:
df_final['WCS_Element'] = 1-(df_final['Importance score']/100)*(df_final['Response Score']/100)
df_final['Component_Q_ScriptActID'] = df_final['Installed Component'] + df_final['Questions'] + df_final['Script Activity ID']



In [36]:
df_final.head()

Unnamed: 0,Inspection Date,Response Score,Script Item Number,Location Code,Plant Number,Installed Component,Installed Modifier,Script ID,Script Description,Attribute ID,...,Site,Work Group,Zone,Script Version Number,5Char_Site_Code,Civil Item,Script and Question,Importance score,WCS_Element,Component_Q_ScriptActID
0,2022-08-01,0.0,7,SUND4,SUND4X30VT,VT,B,RVIVTAA,VT S/S VISUAL INSPECTION,VICIVCN01,...,SUND,TCNWANG,TCN,1,SUND4,Civil Item,VT S/S VISUAL INSPECTION Rate condition of the...,82.5,1.0,VT Rate condition of the foundation / baseS-06...
1,2022-08-01,20.0,13,SUND4,SUND4X30VT,VT,B,RVIVTAA,VT S/S VISUAL INSPECTION,VICIVMT01,...,SUND,TCNWANG,TCN,1,SUND4,Civil Item,VT S/S VISUAL INSPECTION Rate corrosion on the...,75.0,0.85,VT Rate corrosion on the steel asset support s...
2,2022-08-01,0.0,7,SUND4,SUND4X303,DIS,,RVIDISAA,DISC S/S VISUAL INSPECTION,VICIVCN01,...,SUND,TCNWANG,TCN,1,SUND4,Civil Item,DISC S/S VISUAL INSPECTION Rate condition of t...,85.0,1.0,DISRate condition of the foundation / baseS-06...
3,2022-08-01,0.0,11,SUND4,SUND4X303,DIS,,RVIDISAA,DISC S/S VISUAL INSPECTION,VICIVCN02,...,SUND,TCNWANG,TCN,1,SUND4,Civil Item,DISC S/S VISUAL INSPECTION Rate condition of t...,75.0,1.0,DISRate condition of the concrete asset suppor...
4,2022-08-02,0.0,7,SUND4,SUND4X124,DIS,,RVIDISAA,DISC S/S VISUAL INSPECTION,VICIVCN01,...,SUND,TCNWANG,TCN,1,SUND4,Civil Item,DISC S/S VISUAL INSPECTION Rate condition of t...,85.0,1.0,DISRate condition of the foundation / baseS-06...


In [37]:
# Create a new column to store the maximum Response Score for each Component_Q_ScriptActID
df_final['Max_Response_Score'] = None

# Loop through each row in the DataFrame
for index, row in df_final.iterrows():
    # Get the current Component_Q_ScriptActID
    current_id = row['Component_Q_ScriptActID']
    
    # Find the maximum Response Score for the current Component_Q_ScriptActID
    max_score = df_final[df_final['Component_Q_ScriptActID'] == current_id]['Response Score'].max()
    
    # Assign the maximum score to the new column
    df_final.at[index, 'Max_Response_Score'] = max_score
        

In [41]:
df_final['WCS'] = 1- df_final.groupby('Component_Q_ScriptActID')['WCS_Element'].transform('prod')

df_final.head()

Unnamed: 0,Inspection Date,Response Score,Script Item Number,Location Code,Plant Number,Installed Component,Installed Modifier,Script ID,Script Description,Attribute ID,...,Zone,Script Version Number,5Char_Site_Code,Civil Item,Script and Question,Importance score,WCS_Element,Component_Q_ScriptActID,Max_Response_Score,WCS
0,2022-08-01,0.0,7,SUND4,SUND4X30VT,VT,B,RVIVTAA,VT S/S VISUAL INSPECTION,VICIVCN01,...,TCN,1,SUND4,Civil Item,VT S/S VISUAL INSPECTION Rate condition of the...,82.5,1.0,VT Rate condition of the foundation / baseS-06...,0.0,0.0
1,2022-08-01,20.0,13,SUND4,SUND4X30VT,VT,B,RVIVTAA,VT S/S VISUAL INSPECTION,VICIVMT01,...,TCN,1,SUND4,Civil Item,VT S/S VISUAL INSPECTION Rate corrosion on the...,75.0,0.85,VT Rate corrosion on the steel asset support s...,20.0,0.15
2,2022-08-01,0.0,7,SUND4,SUND4X303,DIS,,RVIDISAA,DISC S/S VISUAL INSPECTION,VICIVCN01,...,TCN,1,SUND4,Civil Item,DISC S/S VISUAL INSPECTION Rate condition of t...,85.0,1.0,DISRate condition of the foundation / baseS-06...,0.0,0.0
3,2022-08-01,0.0,11,SUND4,SUND4X303,DIS,,RVIDISAA,DISC S/S VISUAL INSPECTION,VICIVCN02,...,TCN,1,SUND4,Civil Item,DISC S/S VISUAL INSPECTION Rate condition of t...,75.0,1.0,DISRate condition of the concrete asset suppor...,0.0,0.0
4,2022-08-02,0.0,7,SUND4,SUND4X124,DIS,,RVIDISAA,DISC S/S VISUAL INSPECTION,VICIVCN01,...,TCN,1,SUND4,Civil Item,DISC S/S VISUAL INSPECTION Rate condition of t...,85.0,1.0,DISRate condition of the foundation / baseS-06...,0.0,0.0


In [None]:
data_xyz = pd.DataFrame(df_final['Component_Q_ScriptActID'].unique())

data_xyz.head()


Unnamed: 0,0
0,VT Rate condition of the foundation / baseS-06...
1,VT Rate corrosion on the steel asset support s...
2,DISRate condition of the foundation / baseS-06...
3,DISRate condition of the concrete asset suppor...
4,DISRate condition of the foundation / baseS-06...


In [42]:
df_final.to_excel(r'C:\Users\robert.everitt\OneDrive - National Grid\Data Engineering Course\WCS_Test.xlsx')

In [None]:
### add in code to get *max* reposce score for a particular question (script activity id + script + question) before calulating WCS to avoid counting mulitple fence panels (for example) as different defects