In [None]:
import pandas as pd

# Read the training annotations from disk
file_path = 'psc_severity_train.csv'  # Replace with the correct file path
train_data = pd.read_csv(file_path)

# Tally every distinct `annotation_severity` label; missing values get their own row
annotation_severity_counts = train_data.loc[:, 'annotation_severity'].value_counts(dropna=False)

# Heading and tally are written together, one per line
print("Annotation Severity Types and Frequencies:", annotation_severity_counts, sep="\n")


Annotation Severity Types and Frequencies:
annotation_severity
Medium              6919
Low                 6859
High                4042
Not a deficiency     150
NaN                   21
Name: count, dtype: int64


In [None]:
# Tally every distinct `VesselGroup` value; missing values get their own row
vessel_group_counts = train_data.loc[:, 'VesselGroup'].value_counts(dropna=False)

# Heading and tally are written together, one per line
print("VesselGroup Types and Frequencies:", vessel_group_counts, sep="\n")


VesselGroup Types and Frequencies:
VesselGroup
Dry Bulk         12830
General Cargo     2007
Container         1127
Chemical          1107
Oil                492
Ro-Ro              220
Liquefied Gas      205
Miscellaneous        3
Name: count, dtype: int64


In [None]:
# Define age ranges. Bin edges are upper-inclusive (right=True), so the bin
# (5, 10] carries the label '6-10 years', (10, 15] carries '11-15 years', etc.
age_bins = [0, 5, 10, 15, 20, 30, 40, 50, float('inf')]
age_labels = ['0-5 years', '6-10 years', '11-15 years', '16-20 years', '21-30 years', '31-40 years', '41-50 years', '50+ years']

# Classify ages into the defined ranges.
# include_lowest=True closes the first bin on the left: without it the first
# interval is (0, 5] and a vessel whose age is exactly 0 is left unclassified (NaN).
train_data['age_group'] = pd.cut(
    train_data['age'],
    bins=age_bins,
    labels=age_labels,
    right=True,
    include_lowest=True,
)

# Count frequencies of each age group.
# dropna=False mirrors the severity and VesselGroup tables and makes any age that
# fell outside every bin (missing or negative) visible instead of silently dropped.
age_group_counts = train_data['age_group'].value_counts(dropna=False)

print("Age Ranges and Frequencies:")
print(age_group_counts)


Age Ranges and Frequencies:
age_group
11-15 years    8353
6-10 years     3612
16-20 years    3002
0-5 years      1868
21-30 years    1097
31-40 years      44
41-50 years      15
50+ years         0
Name: count, dtype: int64


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Step 1: Load the dataset
file_path = 'psc_severity_train.csv'  # Replace with the correct file path
train_data = pd.read_csv(file_path)

# Step 2: Preprocess the dataset
# Ensure deficiency codes are 5 digits with leading zeros if necessary.
# A missing code is passed through unchanged rather than crashing int().
train_data['deficiency_code'] = train_data['deficiency_code'].apply(
    lambda x: str(int(x)).zfill(5) if pd.notna(x) else x
)

# Drop rows with null/blank values in annotation_severity
train_data = train_data.dropna(subset=['annotation_severity'])

# Step 3: Extract TF-IDF keywords from def_text
# Minimum share of one severity among the rows mentioning a keyword for that
# keyword to count as an indicator of the severity.
SEVERITY_SHARE_THRESHOLD = 0.7

# TfidfVectorizer rejects NaN documents, so missing descriptions are fed in as
# empty strings (they contribute no terms). The keyword scan below already
# treats missing text as "no match" via na=False.
vectorizer = TfidfVectorizer(max_features=50, stop_words='english')
tfidf_matrix = vectorizer.fit_transform(train_data['def_text'].fillna(''))
keywords = vectorizer.get_feature_names_out()

# Analyze relationships between keywords and annotation_severity
indicators_by_severity = {'High': [], 'Low': [], 'Medium': [], 'Not a deficiency': []}

for keyword in keywords:
    # The keyword is a literal token, not a pattern, hence regex=False.
    keyword_presence = train_data['def_text'].str.contains(
        keyword, case=False, na=False, regex=False
    )
    severity_distribution = train_data.loc[keyword_presence, 'annotation_severity'].value_counts(normalize=True)
    if severity_distribution.empty:
        continue  # keyword matched no row; nothing to classify

    # The shares sum to 1, so at most one severity can exceed the threshold;
    # checking only the most frequent one is equivalent to testing each label in turn.
    dominant_severity = severity_distribution.idxmax()
    if (
        severity_distribution[dominant_severity] > SEVERITY_SHARE_THRESHOLD
        and dominant_severity in indicators_by_severity
    ):
        indicators_by_severity[dominant_severity].append(keyword)

# Module-level names used by the tie-breaking logic further down.
high_indicators = indicators_by_severity['High']
low_indicators = indicators_by_severity['Low']
medium_indicators = indicators_by_severity['Medium']
nad_indicators = indicators_by_severity['Not a deficiency']

# Step 4: Define VesselGroup and Age Points
# Points per vessel type, looked up by the exact `VesselGroup` string.
# The original generic names are kept. The aliases after them cover the labels
# that actually appear in the VesselGroup frequency table of the training data
# ('Oil', 'Chemical', 'Liquefied Gas', 'Miscellaneous'); these previously matched
# no key and therefore silently contributed 0 points.
vessel_points = {
    'Tanker': 10,
    'Gas Carrier': 9,
    'Container': 8,
    'Dry Bulk': 7,
    'Passenger': 6,
    'Fishing': 4,
    'General Cargo': 5,
    'Others': 3,
    # Dataset spellings, scored like the generic category they presumably
    # correspond to -- NOTE(review): confirm these mappings with the domain owner.
    'Oil': 10,            # treated as 'Tanker'
    'Chemical': 10,       # treated as 'Tanker'
    'Liquefied Gas': 9,   # treated as 'Gas Carrier'
    'Miscellaneous': 3,   # treated as 'Others'
    # NOTE(review): 'Ro-Ro' also occurs in the data but has no obvious
    # counterpart above; it still scores 0 until a value is agreed.
}

def calculate_age_points(age):
    """Return the point score for a vessel of the given age.

    Brackets are upper-inclusive: up to 5 -> 2, up to 10 -> 4, up to 15 -> 6,
    up to 20 -> 8, up to 30 -> 10, up to 40 -> 8, up to 50 -> 5.
    Any value that fits no bracket (above 50, or one such as NaN for which
    every comparison is False) scores 2.
    """
    # (inclusive upper bound, points), ascending so the first hit is the bracket.
    brackets = (
        (5, 2),
        (10, 4),
        (15, 6),
        (20, 8),
        (30, 10),
        (40, 8),
        (50, 5),
    )
    for upper_bound, points in brackets:
        if age <= upper_bound:
            return points
    # For ages beyond 50, assign lower points due to reduced activity.
    return 2

# Step 5: Tie-Breaking Logic with Weighted Point System
def resolve_ties(group):
    """Pick one consensus severity for a group of annotations.

    Resolution order:
      1. All annotations agree -> that severity.
      2. One severity has strictly the most votes -> that severity (plurality).
      3. Otherwise a point score breaks the tie, and only the severities that
         are tied for first place are eligible to win.

    Args:
        group: DataFrame of annotation rows belonging together. Must contain the
            columns ``annotation_severity``, ``def_text``, ``VesselGroup`` and
            ``age``; the last three are read from the first row only.

    Returns:
        The chosen severity label.

    Reads the module-level ``high_indicators``, ``medium_indicators``,
    ``low_indicators``, ``vessel_points`` and ``calculate_age_points``.
    """
    severities = group['annotation_severity'].tolist()
    if len(set(severities)) == 1:  # If all annotations are the same, return directly
        return severities[0]

    # Count votes for each severity
    severity_votes = pd.Series(severities).value_counts()
    max_votes = severity_votes.max()
    plurality_severities = severity_votes[severity_votes == max_votes].index.tolist()

    if len(plurality_severities) == 1:
        return plurality_severities[0]  # Plurality wins

    # If still tied, apply the point-based system
    def_text = group['def_text'].iloc[0]
    vessel_group = group['VesselGroup'].iloc[0]
    age = group['age'].iloc[0]

    # The indicator keywords are lowercase and were selected with a
    # case-insensitive match, so compare against lowercased text. A missing
    # (non-string) description simply matches no keyword.
    text = def_text.lower() if isinstance(def_text, str) else ''

    scores = {'High': 0, 'Medium': 0, 'Low': 0, 'Not a deficiency': 0}

    # Add points from def_text. 'Not a deficiency' keywords carry zero weight,
    # so they never change the score and are not evaluated.
    scores['High'] += sum(5 for keyword in high_indicators if keyword in text)
    scores['Medium'] += sum(3 for keyword in medium_indicators if keyword in text)
    scores['Low'] += sum(1 for keyword in low_indicators if keyword in text)

    # Vessel type and age only ever raise the 'High' score.
    if vessel_group in vessel_points:
        scores['High'] += vessel_points[vessel_group]
    scores['High'] += calculate_age_points(age)

    # Only labels that actually tied may win: 'High' always collects age points,
    # so ranking every label would let it beat e.g. a Low/Medium tie although
    # nobody voted for it. Walking `scores` in its own order keeps the previous
    # behaviour for equal scores (the more severe label comes first and wins).
    tied = [severity for severity in scores if severity in plurality_severities]
    if not tied:
        # Tied labels are all outside the score table; nothing to rank.
        return plurality_severities[0]
    return max(tied, key=scores.get)

# Step 6: Apply Consensus Logic
# Only the columns resolve_ties reads are handed to apply(). Applying over a
# frame that still contains the grouping columns is deprecated in recent pandas
# (presumably the warning this cell emitted), and the function does not use them.
consensus = (
    train_data.groupby(['PscInspectionId', 'deficiency_code'])[
        ['annotation_severity', 'def_text', 'VesselGroup', 'age']
    ]
    .apply(resolve_ties)
    .reset_index(name='predicted_severity')
)

# Step 7: Merge the predicted severity back to the original dataset
# validate='many_to_one' asserts that each (inspection, code) pair has exactly
# one consensus row, so the merge can never duplicate annotation rows.
train_data = train_data.merge(
    consensus,
    on=['PscInspectionId', 'deficiency_code'],
    validate='many_to_one',
)

# Step 8: Save the resulting DataFrame to a CSV file
output_file = 'resolved_predicted_severity.csv'
train_data.to_csv(output_file, index=False)

# Step 9: Display a Preview of the Resulting Dataset
print(f"Predicted severity has been saved to '{output_file}'.")
print(train_data.head(10))

# Step 10: Download the resolved CSV file
# google.colab exists only inside Colab; elsewhere the file is already on local
# disk, so the download step is skipped instead of failing the cell.
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    files.download(output_file)


  .apply(resolve_ties)


Predicted severity has been saved to 'resolved_predicted_severity.csv'.
   PscInspectionId deficiency_code  annotation_id username  \
0          1702496           01104       42180251   mihail   
1          1702496           01104       42532116     marc   
2          1702496           01104       42631723     raul   
3          1795901           10135       42190695   mihail   
4          1795901           10135       42851742     raul   
5          1795901           10135       43136020     marc   
6          1667488           10135       42224469   mihail   
7          1667488           10135       43206905     marc   
8          1667488           10135       43217322     raul   
9          1733202           10135       42224484   mihail   

  annotation_severity                                           def_text  \
0                 Low  PscInspectionId: 1702496\n\nDeficiency/Finding...   
1                High  PscInspectionId: 1702496\n\nDeficiency/Finding...   
2                

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>