## Data Understanding

##### Column decsriptions


1. Contract Number: 66,549 unique values (slightly fewer than the total rows), with 72 missing entries.
2. Amount: 50,594 unique values, indicating a high variance in contract amounts, with 855 missing values.
3. Dates:
Various date columns have a broad range of unique values, suggesting contracts are spread over a significant timeline.
4. Sign Date, Start Date, End Date, etc., have varying numbers of unique values, hinting at different stages and lengths of contract processes.
5. Agpo Certificate Number: Only 13,606 unique values, with 57,311 missing entries, indicating that this information may not be recorded for every contract.
6. Awarded Agpo Group Id: Contains only 4 unique values (Women, Youth, All, People with Disabilities) but has a high number of missing values (58,261).
7. Terminated: Only one unique value, 1.0, across the 88 non-null entries, suggesting limited use of this column in identifying terminated contracts.
8. Financial Year: 15 unique values, representing different fiscal years.
9. Quarter: 4 unique values (Q1, Q2, Q3, Q4), with 3,495 missing entries.
10. PE Name: 498 unique values, indicating a range of entities participating in the contracts.
11. Supplier Name: 25,419 unique supplier names, suggesting substantial diversity in supplier entities.
12. No. of B.O.I: Contains only 19 unique values, with the majority of entries missing (67,866).
13. Created At: Timestamped information on record creation with 48,558 unique values and 34,028 missing values, hinting at possibly incomplete time-tracking data.

In [33]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import LabelEncoder
from datetime import datetime
import matplotlib.pyplot as plt

# Load the dataset
df = pd.read_excel("published_contracts (3).xlsx")

In [34]:
df.head()

Unnamed: 0,Contract Number,Amount,Award Date,Tender Title,Eval Completion Date,Notification Of Award Date,Sign Date,Start Date,End Date,Agpo Certificate Number,Awarded Agpo Group Id,Created By,Terminated,Financial Year,Quarter,Tender Ref.,PE Name,Supplier Name,No. of B.O.I,Created At
0,0015.1,1962488.0,2018-09-26,"Supply of Non Pharmaceuticals (Surgical Tubes,...",2018-09-25,2018-09-26,2018-09-26,2018-09-26,2018-10-25,,,1,,2018/2019,Q1,KEMSA/ONT 06/2017-2019,Higher Education Loan Board,VIABLE DECO SOLUTIONS LIMITED,,2019-01-31 09:55:20
1,38632,257736.0,2018-10-02,"Supply of Non Pharmaceuticals (Surgical Tubes,...",2018-09-25,2018-10-04,2018-10-05,2018-10-08,2018-10-23,,,1,,2018/2019,Q2,KEMSA/ONT 06/2017-2019,Kenya Post Office Saving Bank,COMPUTERWAYS LIMITED,,2019-02-19 10:14:13
2,38631,254736.0,2018-09-25,"Supply of Non Pharmaceuticals (Surgical Tubes,...",2018-09-19,2018-10-02,2018-10-05,2018-10-12,2018-11-12,,,1,,2018/2019,Q1,KEMSA/ONT 06/2017-2019,Kenya Post Office Saving Bank,REALISTIC SYSTEMS AND TECHNOLOGIES,,2019-02-27 14:28:26
3,0189,149300.0,2019-02-19,"Supply of Non Pharmaceuticals (Surgical Tubes,...",2019-01-15,2019-02-07,2019-02-11,2019-02-11,2019-03-11,,,1,,2018/2019,Q3,KEMSA/ONT 06/2017-2019,Embu,JIMRIVER CARGO LOGISTICS LIMITED,,2019-03-15 10:03:54
4,LPO 1079.,81500.0,2018-12-27,"Supply of Non Pharmaceuticals (Surgical Tubes,...",2018-12-27,2018-12-27,2018-12-27,2018-12-27,2019-01-26,,,1,,2018/2019,Q2,KEMSA/ONT 06/2017-2019,Higher Education Loan Board,MATKY INVESTMENTS,,2019-03-17 18:11:54


In [35]:
# Shape of the dataset
df.shape

(82708, 20)

In [36]:
# Checking for null values
df.isna().sum()

Unnamed: 0,0
Contract Number,72
Amount,855
Award Date,0
Tender Title,15
Eval Completion Date,0
Notification Of Award Date,0
Sign Date,12724
Start Date,988
End Date,1012
Agpo Certificate Number,57311


In [37]:
# Checkng for duplicates
df.duplicated().sum()

141

In [38]:
import pandas as pd # Importing pandas for data manipulation

class DataUnderstanding():
    """Class that provides an understanding of a dataset"""

    def __init__(self, data=None):
        """Initialization"""
        self.df = data

    def load_data(self, path):
        """Load the data"""
        if self.df is None:

            self.df = pd.read_excel(path)
        return self.df

    def concat_data(self, other_df):
        """Concatenate the current dataframe with another dataframe vertically"""
        if self.df is not None and other_df is not None:
            self.df = pd.concat([self.df, other_df], axis=0, ignore_index=True)
        return self.df

    def understanding(self):
        """Provides insights into the dataset"""
        # Info
        print("INFO")
        print("-" * 4)
        self.df.info()

        # Shape
        print("\n\nSHAPE")
        print("-" * 5)
        print(f"Records in dataset: {self.df.shape[0]} with {self.df.shape[1]} columns.")

        # Columns
        print("\n\nCOLUMNS")
        print("-" * 6)
        print("Columns in the dataset are:")
        for idx in self.df.columns:
            print(f"- {idx}")

        # Unique Values
        print("\n\nUNIQUE VALUES")
        print("-" * 12)
        for col in self.df.columns:
            print(f"Column {col} has {self.df[col].nunique()} unique values")
            if self.df[col].nunique() < 12:
                print(f"Top unique values in {col} include:")
                for idx in self.df[col].value_counts().index:
                    print(f"- {idx}")
            print("")

        # Missing or Null Values
        print("\nMISSING VALUES")
        print("-" * 15)
        for col in self.df.columns:
            print(f"Column {col} has {self.df[col].isnull().sum()} missing values.")

        # Duplicate Values
        print("\n\nDUPLICATE VALUES")
        print("-" * 16)
        print(f"The dataset has {self.df.duplicated().sum()} duplicated records.")

# Initialize data understanding
data = DataUnderstanding()

# Load the first dataset
data_path1 = "published_contracts (3).xlsx" # The path to your Excel file
df = data.load_data(data_path1)


df.head()

Unnamed: 0,Contract Number,Amount,Award Date,Tender Title,Eval Completion Date,Notification Of Award Date,Sign Date,Start Date,End Date,Agpo Certificate Number,Awarded Agpo Group Id,Created By,Terminated,Financial Year,Quarter,Tender Ref.,PE Name,Supplier Name,No. of B.O.I,Created At
0,0015.1,1962488.0,2018-09-26,"Supply of Non Pharmaceuticals (Surgical Tubes,...",2018-09-25,2018-09-26,2018-09-26,2018-09-26,2018-10-25,,,1,,2018/2019,Q1,KEMSA/ONT 06/2017-2019,Higher Education Loan Board,VIABLE DECO SOLUTIONS LIMITED,,2019-01-31 09:55:20
1,38632,257736.0,2018-10-02,"Supply of Non Pharmaceuticals (Surgical Tubes,...",2018-09-25,2018-10-04,2018-10-05,2018-10-08,2018-10-23,,,1,,2018/2019,Q2,KEMSA/ONT 06/2017-2019,Kenya Post Office Saving Bank,COMPUTERWAYS LIMITED,,2019-02-19 10:14:13
2,38631,254736.0,2018-09-25,"Supply of Non Pharmaceuticals (Surgical Tubes,...",2018-09-19,2018-10-02,2018-10-05,2018-10-12,2018-11-12,,,1,,2018/2019,Q1,KEMSA/ONT 06/2017-2019,Kenya Post Office Saving Bank,REALISTIC SYSTEMS AND TECHNOLOGIES,,2019-02-27 14:28:26
3,0189,149300.0,2019-02-19,"Supply of Non Pharmaceuticals (Surgical Tubes,...",2019-01-15,2019-02-07,2019-02-11,2019-02-11,2019-03-11,,,1,,2018/2019,Q3,KEMSA/ONT 06/2017-2019,Embu,JIMRIVER CARGO LOGISTICS LIMITED,,2019-03-15 10:03:54
4,LPO 1079.,81500.0,2018-12-27,"Supply of Non Pharmaceuticals (Surgical Tubes,...",2018-12-27,2018-12-27,2018-12-27,2018-12-27,2019-01-26,,,1,,2018/2019,Q2,KEMSA/ONT 06/2017-2019,Higher Education Loan Board,MATKY INVESTMENTS,,2019-03-17 18:11:54


In [39]:
# Get an understanding of the dataset
data.understanding()

INFO
----
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82708 entries, 0 to 82707
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Contract Number             82636 non-null  object 
 1   Amount                      81853 non-null  float64
 2   Award Date                  82708 non-null  object 
 3   Tender Title                82693 non-null  object 
 4   Eval Completion Date        82708 non-null  object 
 5   Notification Of Award Date  82708 non-null  object 
 6   Sign Date                   69984 non-null  object 
 7   Start Date                  81720 non-null  object 
 8   End Date                    81696 non-null  object 
 9   Agpo Certificate Number     25397 non-null  object 
 10  Awarded Agpo Group Id       24447 non-null  object 
 11  Created By                  82708 non-null  int64  
 12  Terminated                  88 non-null     float64
 13  Financial Year       

#### Summary Observations:
- Data Completeness: Several columns, especially those related to specific certifications and awards (e.g., Agpo Certificate Number and Awarded Agpo Group Id), have significant missing values, which may affect analyses focused on these categories.
- Data Quality: With 141 duplicates and varying levels of missing values, some data cleaning will be required, particularly for categorical columns that are missing a large portion of data.
- Potential Areas for Analysis: Despite missing values, substantial information is available regarding contract details (e.g., Amount, PE Name, Supplier Name), dates, and contract progression stages. These can support analyses related to contract timelines, supplier diversity, and financial tracking over different periods (fiscal years and quarters).

The dataset has 124 duplicate values


Here's an analysis of each column's relevance for fraud detection, ranked by importance:
High Relevance (Critical for Fraud Detection):

Temporal Columns (Pattern Analysis):


Award Date
Eval Completion Date
Notification Of Award Date
Sign Date
Start Date
End Date
→ These can reveal suspicious patterns like:

Backdating
Unrealistic timelines
Process sequence violations




Financial Information:


Amount
→ Critical for detecting:

Split purchases
Threshold avoidance
Unusual pricing patterns
Round numbers (often indicative of fraud)




Entity Information:


Supplier Name
PE Name (Procuring Entity)
→ Important for:

Shell company detection
Multiple awards to same supplier
Cross-referencing with known fraudulent entities



Medium Relevance:

Process Identifiers:


Contract Number
Tender Ref.
→ Useful for:

Detecting duplicate contracts
Sequential pattern analysis
Reference number manipulation




Classification Fields:


Financial Year
Quarter
→ Helpful for:

End-of-period fraud patterns
Budget exhaustion patterns
Seasonal anomalies




Compliance Indicators:


Agpo Certificate Number
Awarded Agpo Group Id
→ Can indicate:

Certificate fraud
Qualification manipulation



Lower Relevance (But Still Useful):

Metadata:


Created By
Created At
No. of B.O.I
→ Can reveal:

Unusual creation patterns
User behavior anomalies




Status Indicators:


Terminated
→ May indicate:

Contract execution issues
Post-award problems



For your ML model, I recommend focusing on these key feature engineering approaches:

Temporal Features:


Time differences between dates
Process duration
Date sequence violations
End of period patterns


Financial Features:


Amount distributions
Threshold proximity
Round number detection
Split payment patterns


Entity Features:


Supplier award frequency
Entity relationships
New supplier flags
Geographic patterns


Metadata Features:


Creation patterns
User behavior patterns
Documentation completeness

## Data Cleaning

In [40]:
class DataCleaning(DataUnderstanding):
    """This class is used for data cleaning"""

    def drop_columns(self, columns):
        """Drop specified columns"""
        if self.df is not None:
            print(f"Dropping Columns: {columns}")
            self.df.drop(columns=columns, inplace=True)


    def strip_column_names(self):
        """Strip whitespace from column names"""
        if self.df is not None:
            print("Stripping whitespace from column names")
            self.df.columns = self.df.columns.str.strip()

    def convert_to_datetime(self, column_name):
        """Convert a column to datetime format"""
        if self.df is not None:
            self.df[column_name] = pd.to_datetime(self.df[column_name])

    def filter_year(self, column_name, year):
        """Filter rows based on a specific year"""
        if self.df is not None:
            self.df = self.df[self.df[column_name].dt.year == year]

    def num_duplicates(self,df):
        """Check for the total number of duplicates"""
        if self.df is not None:
            print(f"Number of duplicates in {self.df.duplicated().sum()}")
            self.df.duplicated().sum()


data = DataCleaning()

# Load the first dataset
data_path1 = 'published_contracts (3).xlsx'
data.load_data(data_path1)

# Drop duplicates
data.df.drop_duplicates(inplace=True)
# Drop irrelevant columns
irrelevant_columns = ['Agpo Certificate Number', 'Awarded Agpo Group Id', 'Terminated',
                      'No. of B.O.I']

data.drop_columns(irrelevant_columns)

# Strip column names of any leading/trailing whitespace
# data.strip_column_names()

# List of columns to convert
date_columns = ['Award Date', 'Eval Completion Date', 'Sign Date', 'Created At']


# Apply conversion
for column in date_columns:
    data.df[column] = pd.to_datetime(data.df[column], errors='coerce')  # errors='coerce' handles invalid parsing


# Stripping white spaces from the columns
df.columns = df.columns.str.strip()

# Store the final cleaned dataset in df
df = data.df

# Print the cleaned DataFrame
print(df.head())

Dropping Columns: ['Agpo Certificate Number', 'Awarded Agpo Group Id', 'Terminated', 'No. of B.O.I']
  Contract Number     Amount Award Date  \
0          0015.1  1962488.0 2018-09-26   
1           38632   257736.0 2018-10-02   
2           38631   254736.0 2018-09-25   
3            0189   149300.0 2019-02-19   
4       LPO 1079.    81500.0 2018-12-27   

                                        Tender Title Eval Completion Date  \
0  Supply of Non Pharmaceuticals (Surgical Tubes,...           2018-09-25   
1  Supply of Non Pharmaceuticals (Surgical Tubes,...           2018-09-25   
2  Supply of Non Pharmaceuticals (Surgical Tubes,...           2018-09-19   
3  Supply of Non Pharmaceuticals (Surgical Tubes,...           2019-01-15   
4  Supply of Non Pharmaceuticals (Surgical Tubes,...           2018-12-27   

  Notification Of Award Date  Sign Date  Start Date    End Date  Created By  \
0                 2018-09-26 2018-09-26  2018-09-26  2018-10-25           1   
1                 2

In [42]:
import pandas as pd

class DataUnderstanding:
    def load_data(self, file_path):
        """Load data from an Excel file."""
        try:
            self.df = pd.read_excel(file_path)
            print(f"Data loaded successfully from {file_path}")
        except Exception as e:
            print(f"Error loading data: {e}")
            self.df = None

class DataCleaning(DataUnderstanding):
    """This class is used for data cleaning"""

    def drop_columns(self, columns):
        """Drop specified columns."""
        if self.df is not None:
            print(f"Dropping Columns: {columns}")
            self.df.drop(columns=columns, inplace=True)

    def strip_column_names(self):
        """Strip whitespace from column names."""
        if self.df is not None:
            print("Stripping whitespace from column names")
            self.df.columns = self.df.columns.str.strip()

    def convert_to_datetime(self, columns):
        """Convert specified columns to datetime format, handling errors with NaT."""
        if self.df is not None:
            for column in columns:
                print(f"Converting {column} to datetime format")
                self.df[column] = pd.to_datetime(self.df[column], errors='coerce')

    # def filter_year(self, column_name, year):
    #     """Filter rows based on a specific year in a date column."""
    #     if self.df is not None and column_name in self.df.columns:
    #         print(f"Filtering rows for the year {year} in column {column_name}")
    #         self.df = self.df[self.df[column_name].dt.year == year]

    # def remove_null_rows(self, columns=None):
    #     """Remove rows with null values, either in specified columns or all columns."""
    #     if self.df is not None:
    #         if columns:
    #             print(f"Removing rows with nulls in columns: {columns}")
    #             self.df.dropna(subset=columns, inplace=True)
    #         else:
    #             print("Removing rows with any null values")
    #             self.df.dropna(inplace=True)

    def num_duplicates(self):
        """Check and return the total number of duplicate rows."""
        if self.df is not None:
            duplicate_count = self.df.duplicated().sum()
            print(f"Number of duplicate rows: {duplicate_count}")
            return duplicate_count

    def drop_duplicates(self):
        """Drop duplicate rows from the DataFrame."""
        if self.df is not None:
            print("Dropping duplicate rows")
            self.df.drop_duplicates(inplace=True)


# Usage
data = DataCleaning()

# Load the dataset
data_path = 'published_contracts (3).xlsx'
data.load_data(data_path)

# Drop duplicates
data.drop_duplicates()

# Drop irrelevant columns
irrelevant_columns = ['Agpo Certificate Number', 'Awarded Agpo Group Id', 'Terminated', 'No. of B.O.I']
data.drop_columns(irrelevant_columns)

# Strip column names of any leading/trailing whitespace
data.strip_column_names()

# Convert specified columns to datetime format
date_columns = ['Award Date', 'Eval Completion Date', 'Sign Date', 'Created At']
data.convert_to_datetime(date_columns)

# Remove rows with null values in the date columns
# data.remove_null_rows(columns=date_columns)

# Store the final cleaned dataset in df
df = data.df

# Print the cleaned DataFrame
print(df.head())


Data loaded successfully from published_contracts (3).xlsx
Dropping duplicate rows
Dropping Columns: ['Agpo Certificate Number', 'Awarded Agpo Group Id', 'Terminated', 'No. of B.O.I']
Stripping whitespace from column names
Converting Award Date to datetime format
Converting Eval Completion Date to datetime format
Converting Sign Date to datetime format
Converting Created At to datetime format
  Contract Number     Amount Award Date  \
0          0015.1  1962488.0 2018-09-26   
1           38632   257736.0 2018-10-02   
2           38631   254736.0 2018-09-25   
3            0189   149300.0 2019-02-19   
4       LPO 1079.    81500.0 2018-12-27   

                                        Tender Title Eval Completion Date  \
0  Supply of Non Pharmaceuticals (Surgical Tubes,...           2018-09-25   
1  Supply of Non Pharmaceuticals (Surgical Tubes,...           2018-09-25   
2  Supply of Non Pharmaceuticals (Surgical Tubes,...           2018-09-19   
3  Supply of Non Pharmaceuticals (Sur

#### Checkng for duplicate values


In [43]:
# Checking for duplicates incase they were not fully removed
df.duplicated().sum()

14

In [44]:
# Subset the duplicate values
df[df.duplicated]


Unnamed: 0,Contract Number,Amount,Award Date,Tender Title,Eval Completion Date,Notification Of Award Date,Sign Date,Start Date,End Date,Created By,Financial Year,Quarter,Tender Ref.,PE Name,Supplier Name,Created At
36535,GF ATM HIV NFM-2022/2023-OIT-06– FOR SUPPLY OF...,11040000.0,2022-10-07,SUPPLY OF ARVs MEDICINES – PAEDIATRICS,2022-09-09,2022-10-12,2022-12-13,2022-12-13,2023-12-12,11047,2022/2023,Q2,GF ATM HIV NFM-2022/2023 OIT-06,Kenya Medical Supplies Authority,AUROBINDO PHARMA LIMITED,NaT
39683,180,5241680.0,2023-03-16,1186727/2022/2023,2023-03-14,2023-03-31,2023-03-31,2023-03-31,2023-05-05,11962,2022/2023,Q3,SUPPLY OF TOOLS OF TRADE,Makueni,RELIABLE PACKERS LIMITED,NaT
42198,Conract,1166100.0,2023-03-13,CONSULTANCY FOR A FEASIBILITY STUDY TO INFORM ...,2023-03-13,2023-03-14,2023-03-22,2023-04-20,2023-05-05,13648,2022/2023,Q3,NETFUND/RFP/001/2022-2023,Netfund,SUSTAINABLE BUSINESS CONSULTING LIMITED,NaT
45243,CGS/PWRT/OT/22-23/08,2734780.93,2023-03-08,Routine Maintenance of Koyule Hono Road in Nor...,2023-02-24,2023-03-28,2023-04-04,2023-04-11,2023-07-30,12241,2022/2023,Q3,CGS/PWR/OT/2022-2023/08,Siaya,REAL LUC LOGISTICS LIMITED,NaT
47429,1027,1961400.0,2023-05-23,Provision of Consultancy Services for Youth Pr...,2023-05-08,2023-05-23,2023-05-29,2023-06-05,2023-07-31,14494,2022/2023,Q4,CGS/SCM/EDUCATION/ RFP/2022-2023/002,Siaya,PLUTUS CONSULTING LIMITED,NaT
49985,658,9207620.0,2023-03-27,PROPOSED CONSTRUCTION OF A MASONRY PERIMETER ...,2023-03-10,2023-03-27,2023-04-14,2023-04-14,2023-10-13,12209,2022/2023,Q3,CGN/YSGC&SS /ONT/004/2022-2023,Nakuru,NJONTURI COMPANY LIMITED,NaT
57741,003755,717500.0,2022-05-05,PROVISION OF RETURN AIRTICKET NAIROBI-TEL AVIV,2022-05-05,2022-05-05,2022-05-05,2022-05-05,2022-05-25,12627,2021/2022,Q4,WRA/HQ/GOK/DR/141/22-23,Water Resources Authority,LORDSTOWN TRAVEL GROUP LIMITED,2023-12-11 15:46:54
59627,NETFUND/RFP/002/2023-2024,20802280.0,2023-08-29,REQUEST FOR PROPOSAL FOR CONSULTANCY TO DEVELO...,2023-08-28,2023-08-30,2023-09-01,2023-09-04,2024-01-31,13648,2023/2024,Q1,NETFUND/RFP/002/2023-2024,Netfund,JMD ADVOCATES,2024-01-16 14:45:22
60136,1387616,3925742.6,2023-10-24,TENDER FOR THE INSTALLATION OF CULVERT LINES I...,2023-10-19,2023-10-25,2023-10-25,2023-10-25,2024-01-22,12926,2023/2024,Q2,1356992-2023/2024,Kirinyaga,WAGATHITU STRATEGIES,2024-01-25 10:46:43
61068,003,9239410.0,2023-04-26,PROPOSED RE-ROOFING AND REPAIR WORKS OF OLD TO...,2023-04-13,2023-04-26,2023-05-12,2023-05-17,2023-09-15,12209,2022/2023,Q4,CGN/NKRCB/ONT/003/2022-2023,Nakuru,SOFT PROVIDER COMPANY LIMITED,2024-02-08 12:38:07


In [45]:
# Checking for duplicates based on a subset of columns
duplicates = df[df.duplicated(subset=['Contract Number'], keep=False)]
duplicates


Unnamed: 0,Contract Number,Amount,Award Date,Tender Title,Eval Completion Date,Notification Of Award Date,Sign Date,Start Date,End Date,Created By,Financial Year,Quarter,Tender Ref.,PE Name,Supplier Name,Created At
3,0189,149300.0,2019-02-19,"Supply of Non Pharmaceuticals (Surgical Tubes,...",2019-01-15,2019-02-07,2019-02-11,2019-02-11,2019-03-11,1,2018/2019,Q3,KEMSA/ONT 06/2017-2019,Embu,JIMRIVER CARGO LOGISTICS LIMITED,2019-03-15 10:03:54
7,00100,4084650.0,2019-03-12,"Supply of Non Pharmaceuticals (Surgical Tubes,...",2019-03-11,2019-03-12,2019-03-18,2019-03-18,2019-04-29,1,2018/2019,Q3,KEMSA/ONT 06/2017-2019,ICDC,KPMG ADVISORY SERVICES LIMITED,2019-03-18 10:31:41
41,0002,1576.0,2018-10-11,SUPPLY AND DELIVERY OF MINERAL WATER,2018-09-14,2018-10-12,2018-11-09,2018-11-09,2020-06-30,1,2018/2019,Q2,TNTP/SDP/T/002/2018-2020,State Department of Planning,EMMAFEST INVESTMENT,2018-11-14 21:52:25
86,8850,278000.0,2018-07-11,Supply & Delivery of Office Chairs,2018-07-06,2018-07-16,2018-07-16,2018-07-11,2018-07-30,1,2018/2019,Q1,RBA/RFQ/OFFICECHAIRS/613/726/2018,Retirement Benefits Authority,Pema General Suppliers,2018-08-14 18:51:51
99,0000,5556000.0,2018-09-03,PROVISION OF SECURITY GUARDING SERVICES,2018-08-27,2018-09-09,2018-10-01,2018-10-01,2020-06-30,1,2018/2019,Q1,HELB/T/1/2018-2019,Higher Education Loan Board,KLEEN HOMES SECURITY SERVICES LIMITED,2018-10-31 10:59:28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82689,5,1174350.0,2024-04-30,SUPPLY AND DELIVERY OF CLEANING MATERIAL,2024-04-29,2024-04-30,2024-05-08,2024-05-08,2024-06-07,13011,2023/2024,Q4,KCG/CS/QTN/1480063/2023/2024,Kilifi,KONZAI INVESTMENT,2024-10-25 12:03:30
82691,1761331,47100.0,2024-09-10,REPAIR OF KDA 671P,2024-09-10,2024-09-11,2024-09-11,2024-09-13,2024-09-13,15737,2024/2025,Q1,NDMA/SBU/13/2024-2024,National Drought Management Authority,MARALAL SAMBURU AUTO SPARES,2024-10-25 12:24:23
82692,1989,2960000.0,2024-05-02,SUPPLY AND DELIVERY OF BULK FILING CABINET,2024-04-24,2024-05-02,2024-05-02,2024-05-02,2024-05-24,13750,2023/2024,Q4,NLC/RFQ/1482181/2023-2024,National Land Commission,SHAXSHAX GRAFIX & STATIONERIES LIMITED,2024-10-25 12:30:16
82705,4298092,69800.0,2024-07-25,SUPPLY OF STARIONERY,2024-07-24,2024-07-25,2024-07-26,2024-09-10,2024-09-10,15737,2024/2025,Q1,NDMA/SBU/OO3A/24-25,National Drought Management Authority,VEE SYSTEMS COMPUTER LIMITED,2024-10-26 12:20:52


Having subsetted the duplicates by contract number it show that the dataset has no duplicate values

#### Checking for nulll values in teh dataset

In [46]:
df.isna().sum()

Unnamed: 0,0
Contract Number,72
Amount,855
Award Date,5
Tender Title,15
Eval Completion Date,32
Notification Of Award Date,0
Sign Date,12713
Start Date,988
End Date,1012
Created By,0


In [50]:

# Ensure the relevant columns are in datetime format
df['Award Date'] = pd.to_datetime(df['Award Date'], errors='coerce')
df['Sign Date'] = pd.to_datetime(df['Sign Date'], errors='coerce')

# Calculate Start Date by subtracting Award Date from Sign Date
df['Start Date'] = df['Sign Date'] - df['Award Date']

# Convert Start Date to the desired format (YYYY-MM-DD)
df['Formatted Start Date'] = (df['Award Date'] + df['Start Date']).dt.strftime('%Y-%m-%d')

# Print the DataFrame to see the changes
print(df[['Award Date', 'Sign Date', 'Formatted Start Date']].head())



  Award Date  Sign Date Formatted Start Date
0 2018-09-26 2018-09-26           2018-09-26
1 2018-10-02 2018-10-05           2018-10-05
2 2018-09-25 2018-10-05           2018-10-05
3 2019-02-19 2019-02-11           2019-02-11
4 2018-12-27 2018-12-27           2018-12-27


In [51]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 82567 entries, 0 to 82707
Data columns (total 18 columns):
 #   Column                      Non-Null Count  Dtype          
---  ------                      --------------  -----          
 0   Contract Number             82495 non-null  object         
 1   Amount                      81712 non-null  float64        
 2   Award Date                  82562 non-null  datetime64[ns] 
 3   Tender Title                82552 non-null  object         
 4   Eval Completion Date        82535 non-null  datetime64[ns] 
 5   Notification Of Award Date  82567 non-null  object         
 6   Sign Date                   69854 non-null  datetime64[ns] 
 7   Start Date                  69851 non-null  timedelta64[ns]
 8   End Date                    81555 non-null  object         
 9   Created By                  82567 non-null  int64          
 10  Financial Year              82567 non-null  object         
 11  Quarter                     79084 non-null  ob

In [52]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 82567 entries, 0 to 82707
Data columns (total 18 columns):
 #   Column                      Non-Null Count  Dtype          
---  ------                      --------------  -----          
 0   Contract Number             82495 non-null  object         
 1   Amount                      81712 non-null  float64        
 2   Award Date                  82562 non-null  datetime64[ns] 
 3   Tender Title                82552 non-null  object         
 4   Eval Completion Date        82535 non-null  datetime64[ns] 
 5   Notification Of Award Date  82567 non-null  object         
 6   Sign Date                   69854 non-null  datetime64[ns] 
 7   Start Date                  69851 non-null  timedelta64[ns]
 8   End Date                    81555 non-null  object         
 9   Created By                  82567 non-null  int64          
 10  Financial Year              82567 non-null  object         
 11  Quarter                     79084 non-null  ob

In [53]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import datetime

def load_and_preprocess_data(data_path):
    # Define column names
    columns = [
        'Contract_Number', 'Amount', 'Award_Date', 'Tender_Title',
        'Eval_Completion_Date', 'Notification_Date', 'Sign_Date',
        'Start_Date', 'End_Date', 'Created_By', 'Financial_Year',
        'Quarter', 'Tender_Ref', 'PE_Name', 'Supplier_Name', 'Created_At'
    ]

    # Read the data
    df = pd.read_csv(data_path, names=columns)

    # Convert date columns to datetime
    date_columns = ['Award_Date', 'Eval_Completion_Date', 'Notification_Date',
                   'Sign_Date', 'Start_Date', 'End_Date', 'Created_At']

    for col in date_columns:
        df[col] = pd.to_datetime(df[col], errors='coerce')

    # Feature engineering
    df['Processing_Time'] = (df['Award_Date'] - df['Eval_Completion_Date']).dt.days
    df['Contract_Duration'] = (df['End_Date'] - df['Start_Date']).dt.days

    # Extract amount as numeric
    df['Amount'] = pd.to_numeric(df['Amount'], errors='coerce')

    # Create categorical features
    df['Amount_Category'] = pd.qcut(df['Amount'], q=5, labels=['Very Low', 'Low', 'Medium', 'High', 'Very High'])
    df['Duration_Category'] = pd.qcut(df['Contract_Duration'], q=3, labels=['Short', 'Medium', 'Long'])

    return df

def prepare_for_ml(df):
    # Select features for ML
    features = [
        'Amount', 'Processing_Time', 'Contract_Duration',
        'Quarter', 'PE_Name', 'Amount_Category', 'Duration_Category'
    ]

    # Create X (features)
    X = df[features].copy()

    # Encode categorical variables
    categorical_columns = ['Quarter', 'PE_Name', 'Amount_Category', 'Duration_Category']
    encoders = {}

    for col in categorical_columns:
        encoders[col] = LabelEncoder()
        X[col] = encoders[col].fit_transform(X[col].astype(str))

    # Scale numerical features
    scaler = StandardScaler()
    numerical_columns = ['Amount', 'Processing_Time', 'Contract_Duration']
    X[numerical_columns] = scaler.fit_transform(X[numerical_columns])

    return X, encoders, scaler

def train_ml_model(X, y):
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train Random Forest model
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)

    # Evaluate the model
    y_pred = rf_model.predict(X_test)
    print(classification_report(y_test, y_pred))

    return rf_model, X_test, y_test

# Example usage:
if __name__ == "__main__":
    # Load and preprocess data
    df = load_and_preprocess_data('your_data.csv')

    # Prepare features for ML
    X, encoders, scaler = prepare_for_ml(df)

    # Create a target variable (example: predicting Amount_Category)
    y = encoders['Amount_Category'].transform(df['Amount_Category'])

    # Train and evaluate the model
    model, X_test, y_test = train_ml_model(X, y)

FileNotFoundError: [Errno 2] No such file or directory: 'your_data.csv'