In [9]:
# Load & Understand Your Data
# Read the train and test CSV files into DataFrames, then preview the
# training rows and their column labels before any cleaning is applied.
import pandas as pd

# Relative paths: resolved against the notebook's current working directory.
train_df = pd.read_csv("../data/train.csv")
test_df = pd.read_csv("../data/test.csv")

# Plain-text preview of the first rows, followed by the raw column labels.
print(train_df.head())
print("\nColumns:\n", train_df.columns)


                                           File Name  Aggrement Value  \
0  6683127-House-Rental-Contract-GERALDINE-GALINA...             6500   
1  6683129-House-Rental-Contract-Geraldine-Galina...             6500   
2                        18325926-Rental-Agreement-1             4000   
3                          24158401-Rental-Agreement            12000   
4                          36199312-Rental-Agreement             3800   

  Aggrement Start Date Aggrement End Date  Renewal Notice (Days)  \
0           20.05.2007         20.05.2008                   15.0   
1           20.05.2007         20.05.2008                   15.0   
2           05.12.2008         31.11.2009                   90.0   
3           01.04.2008         31.03.2009                   60.0   
4           01.05.2010         31.04.2011                   30.0   

                                           Party One                 Party Two  
0  Antonio Levy S. Ingles, Jr. and/or Mary Rose C...     GERALDINE Q. G

In [10]:
# Trim stray leading/trailing whitespace from every training column label.
train_df = train_df.rename(columns=str.strip)

# Show the labels after trimming.
print(train_df.columns)

Index(['File Name', 'Aggrement Value', 'Aggrement Start Date',
       'Aggrement End Date', 'Renewal Notice (Days)', 'Party One',
       'Party Two'],
      dtype='object')


In [12]:
# Print each label through repr() so the surrounding quotes expose any hidden
# whitespace or unusual characters that a plain print would not reveal.
for col in train_df.columns:
    print(repr(col))

'File Name'
'Aggrement Value'
'Aggrement Start Date'
'Aggrement End Date'
'Renewal Notice (Days)'
'Party One'
'Party Two'


In [13]:
# Correct the "Aggrement" misspelling carried over from the CSV headers.
# Labels that are not present in the frame are ignored by rename().
train_df = train_df.rename(
    {
        "Aggrement Value": "Agreement Value",
        "Aggrement Start Date": "Agreement Start Date",
        "Aggrement End Date": "Agreement End Date",
    },
    axis="columns",
)

print(train_df.columns)



Index(['File Name', 'Agreement Value', 'Agreement Start Date',
       'Agreement End Date', 'Renewal Notice (Days)', 'Party One',
       'Party Two'],
      dtype='object')


In [15]:
# Replace missing cells with empty strings so the concatenation below never
# has to deal with NaN.
train_df = train_df.fillna("")

# The value, date and notice-period fields must be text before joining.
for col in ("Agreement Value", "Agreement Start Date", "Agreement End Date", "Renewal Notice (Days)"):
    train_df[col] = train_df[col].astype(str)

# Space-separated concatenation, in this order: both parties, agreement value,
# start date, end date, renewal notice.
train_df["contract_text"] = train_df["Party One"].str.cat(
    [
        train_df["Party Two"],
        train_df["Agreement Value"],
        train_df["Agreement Start Date"],
        train_df["Agreement End Date"],
        train_df["Renewal Notice (Days)"],
    ],
    sep=" ",
)

train_df[["contract_text"]].head()



Unnamed: 0,contract_text
0,"Antonio Levy S. Ingles, Jr. and/or Mary Rose C..."
1,"Antonio Levy S. Ingles, Jr. and/or Mary Rose C..."
2,MR.K.Kuttan P.M. Narayana Namboodri 4000 05....
3,Hanumaiah Vishal Bhardwaj 12000 01.04.2008 ...
4,Balaji.R Kartheek R 3800 01.05.2010 31.04.201...


In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression

# Features: the "contract_text" string assembled in the preprocessing cell.
X = train_df["contract_text"]
# Targets: one classification output per contract field.
y = train_df[[
    "Party One",
    "Party Two",
    "Agreement Value",
    "Agreement Start Date",
    "Agreement End Date",
    "Renewal Notice (Days)"
]]

# NOTE(review): "contract_text" is the concatenation of these same six target
# columns, so every label appears verbatim inside its own input text. Results
# from this setup do not measure extraction from real contract documents --
# confirm whether the raw document text should be the feature instead.

# TF-IDF bag-of-words features using the vectorizer's default settings.
vectorizer = TfidfVectorizer()
X_vec = vectorizer.fit_transform(X)

# MultiOutputClassifier fits one LogisticRegression per target column; each
# one can only ever predict a value that occurred in that training column.
model = MultiOutputClassifier(LogisticRegression(max_iter=2000))
model.fit(X_vec, y)

print("ML model trained successfully")


ML model trained successfully


In [18]:
# Inspect the test-set column labels before they are renamed.
print(test_df.columns)

Index(['File Name', 'Aggrement Value', 'Aggrement Start Date',
       'Aggrement End Date', 'Renewal Notice (Days)', 'Party One',
       'Party Two'],
      dtype='object')


In [19]:
# Prepare Test Contract Text
# Give the test frame the same header clean-up the training frame received:
# first trim whitespace around every label, then correct the "Aggrement"
# misspelling. Without the trim, a padded header such as "Aggrement Value "
# would not match the mapping below, rename() would skip it silently, and the
# later lookups by column name would raise KeyError.
test_df = test_df.rename(columns=str.strip).rename(columns={
    "Aggrement Value": "Agreement Value",
    "Aggrement Start Date": "Agreement Start Date",
    "Aggrement End Date": "Agreement End Date"
})

print(test_df.columns)


Index(['File Name', 'Agreement Value', 'Agreement Start Date',
       'Agreement End Date', 'Renewal Notice (Days)', 'Party One',
       'Party Two'],
      dtype='object')


In [20]:
# Replace missing cells with empty strings so the concatenation below never
# has to deal with NaN.
test_df = test_df.fillna("")

# The value, date and notice-period fields must be text before joining.
for col in ("Agreement Value", "Agreement Start Date", "Agreement End Date", "Renewal Notice (Days)"):
    test_df[col] = test_df[col].astype(str)

# Space-separated concatenation in the same field order used for training:
# both parties, agreement value, start date, end date, renewal notice.
test_df["contract_text"] = test_df["Party One"].str.cat(
    [
        test_df["Party Two"],
        test_df["Agreement Value"],
        test_df["Agreement Start Date"],
        test_df["Agreement End Date"],
        test_df["Renewal Notice (Days)"],
    ],
    sep=" ",
)

test_df[["contract_text"]].head()


Unnamed: 0,contract_text
0,Hanumaiah Vishal Bhardwaj 12000 01.04.2008 ...
1,S.Sakunthala V.V.Ravi Kian 9000 01.04.2010 31...
2,V.K.NATARAJ VYSHNAVI DAIRY SPECIALITIES Priv...
3,KAPIL MEHROTRA .B.Kishore 15000 07.07.2013 ...


In [21]:
# Vectorise the test text with the already-fitted TF-IDF vocabulary
# (transform only -- the vectorizer is not refitted here).
X_test = vectorizer.transform(test_df["contract_text"])
# One predicted label per target column for every test row.
pred = model.predict(X_test)

# The column order below mirrors the order of the target columns the model
# was fitted on, so each prediction lands under the matching field name.
pred_df = pd.DataFrame(pred, columns=[
    "Pred Party One",
    "Pred Party Two",
    "Pred Agreement Value",
    "Pred Start Date",
    "Pred End Date",
    "Pred Renewal Notice"
])

pred_df.head()


Unnamed: 0,Pred Party One,Pred Party Two,Pred Agreement Value,Pred Start Date,Pred End Date,Pred Renewal Notice
0,Hanumaiah,Vishal Bhardwaj,12000,01.04.2008,31.03.2009,60.0
1,P C MATHEW,L GOPINATH,9000,01.04.2010,31.02.2011,60.0
2,"Antonio Levy S. Ingles, Jr. and/or Mary Rose C...",GERALDINE Q. GALINATO,6500,20.05.2007,20.05.2008,90.0
3,"Antonio Levy S. Ingles, Jr. and/or Mary Rose C...",GERALDINE Q. GALINATO,6500,20.05.2007,20.05.2008,90.0


In [23]:
import pickle

# Persist the fitted classifier and its TF-IDF vectorizer; both are needed to
# make predictions later. The context managers guarantee each file is flushed
# and closed even if pickling raises, instead of leaving the handle open.
# NOTE: only unpickle these files from a trusted location -- pickle.load can
# execute arbitrary code.
with open("../model/contract_model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)
with open("../model/vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

print("FINAL MODEL SAVED SUCCESSFULLY")


FINAL MODEL SAVED SUCCESSFULLY
