In [None]:
import lightgbm as lgb
import numpy as np
import optuna
import sklearn.datasets
import sklearn.metrics
from sklearn.model_selection import StratifiedKFold

# Define the objective function for Optuna optimization
def objective(trial):
    """Return the mean 5-fold validation log loss for one Optuna trial.

    Samples a set of LightGBM hyperparameters from ``trial``, trains a binary
    classifier on each stratified fold of the breast-cancer dataset, and
    scores it on the held-out part of the fold.

    Args:
        trial: The ``optuna`` trial used to sample hyperparameters.

    Returns:
        The binary log loss averaged over the five folds (lower is better).
    """
    # Load dataset (replace this with your actual dataset loading)
    X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)

    # Fixed seed so every trial is evaluated on the same fold assignment
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Hyperparameters to tune. ``suggest_float`` replaces the deprecated
    # ``suggest_uniform`` / ``suggest_loguniform`` (``log=True`` keeps the
    # log-uniform sampling of the regularisation strengths).
    params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
    }

    scores = []

    # Perform cross-validation
    for train_index, valid_index in skf.split(X, y):
        X_train, X_valid = X[train_index], X[valid_index]
        y_train, y_valid = y[train_index], y[valid_index]

        dtrain = lgb.Dataset(X_train, label=y_train)
        # Tie the validation set to the training set so both share one binning
        dvalid = lgb.Dataset(X_valid, label=y_valid, reference=dtrain)

        # Early stopping and log silencing are passed as callbacks; the
        # ``early_stopping_rounds`` / ``verbose_eval`` keyword arguments were
        # removed from ``lgb.train`` in LightGBM 4.0.
        # NOTE(review): lgb.train's default num_boost_round is 100, so a
        # patience of 100 rounds effectively only selects the best iteration
        # rather than stopping early - confirm this is intended.
        model = lgb.train(
            params,
            dtrain,
            valid_sets=[dvalid],
            callbacks=[
                lgb.early_stopping(stopping_rounds=100, verbose=False),
                lgb.log_evaluation(period=0),
            ],
        )

        # Score the fold with the best iteration found on the validation set
        y_pred = model.predict(X_valid, num_iteration=model.best_iteration)
        scores.append(sklearn.metrics.log_loss(y_valid, y_pred))

    # Return average binary log loss over all folds
    return np.mean(scores)

# Run the hyperparameter search; the objective is a loss, so minimise it
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

# Report the best trial: its score first, then one line per tuned parameter
trial = study.best_trial
report = ['Best trial:', '  Value: {:.4f}'.format(trial.value), '  Params: ']
report.extend(
    '    {}: {}'.format(key, value) for key, value in trial.params.items()
)
print('\n'.join(report))

Business Requirement Document: Python Automation Script for File Processing

1. Introduction:
The purpose of this document is to outline the business requirements for a Python automation script designed to streamline file processing tasks. The script is intended to automate the processing of six files, organized into three distinct processes, with the ability to perform checks on each process and draft corresponding emails.

2. Background:
In our organization, we frequently encounter manual file processing tasks that are time-consuming and prone to errors. These tasks involve handling six files, grouped into three processes, where each process consists of two files. To address this challenge, we have developed a Python automation script that will automate these file processing tasks, improve efficiency, and reduce errors.

3. Objectives:
The primary objectives of the Python automation script are as follows:

Automate the processing of six files grouped into three processes.
Perform checks on each process to ensure data integrity and completeness.
Draft and send emails based on the outcome of the processing, providing necessary notifications to stakeholders.
4. Scope:
The scope of the automation script includes the following:

Processing of six input files, organized into three processes: Process A, Process B, and Process C.
Implementation of checks on each process to validate data integrity and completeness.
Generation of email notifications summarizing the processing results and any detected issues.
5. Features:
The key features of the automation script include:

File Processing: The script will read, process, and analyze the contents of six input files.
Checks and Validation: Each process will undergo checks to ensure that the data meets predefined criteria and standards.
Email Notification: The script will draft email notifications summarizing the processing results, including any issues detected during validation.
6. Process Overview:
The three processes to be automated by the script are described below:

Process A: Involves the processing of two input files related to a specific task or operation.
Process B: Includes the processing of two additional input files, distinct from those in Process A.
Process C: The final process entails the processing of the remaining two input files, separate from those in Processes A and B.
7. Deliverables:
The deliverables of the automation script include:

Processed Files: Six output files containing the results of the processing for each process.
Email Notifications: Automatically generated email notifications providing a summary of the processing results and any detected issues.
8. Assumptions:
The following assumptions are made regarding the automation script:

The input files are provided in a predefined format and location accessible to the script.
The script will run on a scheduled basis to ensure timely processing of the files.
Email configuration details, including SMTP server information, are preconfigured for sending notifications.
9. Acceptance Criteria:
The automation script will be considered successful if it meets the following acceptance criteria:

Successfully processes all six input files without errors.
Performs checks on each process and flags any issues or discrepancies detected.
Generates accurate email notifications summarizing the processing results and any identified issues.
10. Conclusion:
The Python automation script described in this document will significantly enhance our file processing capabilities, leading to improved efficiency, reduced errors, and enhanced stakeholder communication. By automating repetitive tasks and implementing checks for data validation, the script will contribute to greater operational efficiency and productivity within our organization.

The data is first segregated into two categories: Single Cash Flows and Net Cash Flows.
Net Cash Flows carry the payment type “OTC NET Cash Flow” regardless of the product type in the system. For this reason, the gross-level payment types are consolidated using a product-payment type consolidation file.
The Breaks/Fails model is trained on two years of historical data spanning 105 product types and 110 payment types. When a new flow is passed into the model, the model predicts the outcome based on this historical data. The model considers the product type, payment type, counterparty name, cash flow amount, execution status, currency, and NTRM Type. Based on these factors, the model predicts STP only when the flow has a 95% probability of having no future fail/break.
The model uses multiple decision trees to make a prediction. The training data was subsampled into smaller parts, each used to train a separate decision tree, and a voting classifier averages the classifications of all the decision trees to produce the probability score.

The historical breaks/fails data used to build the first model has a gap. The breaks and fails recorded in the past two years of data occurred after the pre-team's efforts, which had already matched many flows that would otherwise have been non-STP; as a result, those flows may never have shown a break/fail. This is why we have trained a second model whose target variable is pre-efforts. It is trained on the flows that the pre-settlements team has worked on over the last four months, and it aims to identify whether a trade would require pre-team efforts. Hence, even if the breaks/fails model predicts a flow as STP, the trade is marked as NSTP when the pre-efforts model predicts it as NSTP.

In [None]:
import pandas as pd
import re
import wordninja
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download the NLTK resources used by the preprocessing below.
# 'punkt_tab' is the tokenizer data that word_tokenize looks up on
# NLTK >= 3.9; 'punkt' is kept for older NLTK releases.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Load your dataset
data = pd.read_csv('comments.csv')  # Replace with your actual file path

# Define a function to clean, segment, and preprocess the text
def clean_and_preprocess(text):
    """Normalise one raw comment into a space-separated string of lemmas.

    The comment has its digits removed, is re-segmented into words with
    ``wordninja``, lower-cased and tokenised, stripped of English stop words,
    and finally lemmatised.

    Args:
        text: The raw comment. Non-string values (for example the NaN pandas
            produces for an empty CSV cell) are treated as an empty comment.

    Returns:
        The cleaned tokens joined by single spaces; ``''`` for missing input.
    """
    # re.sub raises TypeError on NaN/None, so short-circuit missing comments
    if not isinstance(text, str):
        return ''

    # Remove numerical characters
    text = re.sub(r'\d+', '', text)
    # Add spaces between words
    text = ' '.join(wordninja.split(text))

    # Look the stop words up once and keep them in a set: calling
    # stopwords.words() inside the filter would rebuild the list per token.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text.lower())
        if token not in stop_words
    )

# Clean every comment and store the result alongside the raw text
data['Cleaned_Comments'] = data['Comments'].map(clean_and_preprocess)

# Turn the cleaned text into a TF-IDF document-term matrix
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data['Cleaned_Comments'])

# Group the comments with K-means and record each comment's cluster label
num_clusters = 5  # Choose the number of clusters
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
data['Cluster'] = kmeans.fit(X).labels_

# Show the first few raw comments of every cluster
for i in range(num_clusters):
    in_cluster = data['Cluster'] == i
    print(f"Cluster {i}:")
    print(data.loc[in_cluster, 'Comments'].head())
    print("\n")

# Count word frequencies per cluster on the cleaned text
from collections import Counter

for i in range(num_clusters):
    cleaned_in_cluster = data.loc[data['Cluster'] == i, 'Cleaned_Comments']
    common_words = Counter(' '.join(cleaned_in_cluster).split()).most_common(10)
    print(f"Cluster {i} common words: {common_words}")
    print("\n")

# Persist the cleaned text and cluster labels next to the raw data (optional)
data.to_csv('cleaned_comments_with_clusters.csv', index=False)


In [None]:
import pandas as pd
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from wordsegment import load, segment
from multiprocessing import Pool

# Download the NLTK resources used by the preprocessing below.
# 'punkt_tab' is the tokenizer data that word_tokenize looks up on
# NLTK >= 3.9; 'punkt' is kept for older NLTK releases.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Load wordsegment data
load()

# Load your dataset
data = pd.read_csv('comments.csv')  # Replace with your actual file path

# Define a function to clean and preprocess the text
def clean_and_preprocess(text):
    """Normalise one raw comment into a space-separated string of lemmas.

    The comment loses everything up to and including its first hyphen, has
    its digits removed, is re-segmented into words with ``wordsegment``,
    lower-cased and tokenised, stripped of English stop words, and finally
    lemmatised.

    Args:
        text: The raw comment. Non-string values (for example the NaN pandas
            produces for an empty CSV cell) are treated as an empty comment.

    Returns:
        The cleaned tokens joined by single spaces; ``''`` for missing input.
    """
    # re.sub raises TypeError on NaN/None, so short-circuit missing comments
    if not isinstance(text, str):
        return ''

    # Drop the leading prefix up to and including the first hyphen
    text = re.sub(r'^.*?-', '', text)
    # Remove numerical characters
    text = re.sub(r'\d+', '', text)
    # Add spaces between words
    text = ' '.join(segment(text))

    # Look the stop words up once and keep them in a set: calling
    # stopwords.words() inside the filter would rebuild the list per token.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text.lower())
        if token not in stop_words
    )

# Use multiprocessing to speed up the cleaning process
def apply_preprocessing(texts):
    """Clean every entry of ``texts`` in parallel worker processes.

    Args:
        texts: Iterable of raw comments, each passed to
            ``clean_and_preprocess``.

    Returns:
        A list with the cleaned comments, in the same order as ``texts``.
    """
    # Pool() with no argument sizes itself; the with-block tears it down
    with Pool() as workers:
        cleaned = workers.map(clean_and_preprocess, texts)
    return cleaned

# Clean the 'Comments' column in parallel and store the result
data['Cleaned_Comments'] = apply_preprocessing(data['Comments'].to_list())

# TF-IDF features over unigrams, bigrams and trigrams; terms found in more
# than 85% of the comments or in fewer than two comments are dropped
vectorizer = TfidfVectorizer(
    ngram_range=(1, 3),
    max_df=0.85,
    min_df=2,
)
X = vectorizer.fit_transform(data['Cleaned_Comments'])

# Group the comments with K-means and record each comment's cluster label
num_clusters = 5  # Choose the number of clusters
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
data['Cluster'] = kmeans.fit(X).labels_

# Function to extract top n-grams from a cluster
def get_top_ngrams(cluster_data, n=10):
    """Return the ``n`` highest-scoring 1- to 3-grams of one cluster.

    A fresh TF-IDF model is fitted on ``cluster_data`` alone, and every term
    is scored by the sum of its TF-IDF weights over the cluster's documents.

    Args:
        cluster_data: Iterable of cleaned comment strings for one cluster.
        n: Maximum number of terms to return.

    Returns:
        A DataFrame with columns ``'term'`` and ``'rank'``, sorted by
        ``'rank'`` in descending order. It is empty (same columns) when the
        cluster contains no usable tokens.

    Raises:
        ValueError: Re-raised from scikit-learn for any fitting problem other
            than an empty vocabulary.
    """
    columns = ['term', 'rank']
    cluster_vectorizer = TfidfVectorizer(ngram_range=(1, 3))

    try:
        weights = cluster_vectorizer.fit_transform(cluster_data)
    except ValueError as err:
        # A cluster made only of empty/stop-word comments has no vocabulary;
        # report "no n-grams" instead of aborting the whole analysis loop.
        if 'empty vocabulary' not in str(err):
            raise
        return pd.DataFrame(columns=columns)

    terms = cluster_vectorizer.get_feature_names_out()
    totals = weights.sum(axis=0)  # one row holding the per-term weight sums
    ranking = pd.DataFrame(
        [(term, totals[0, col]) for col, term in enumerate(terms)],
        columns=columns,
    )
    return ranking.sort_values('rank', ascending=False).head(n)

# Print the top-ranked n-grams of every cluster
for i in range(num_clusters):
    in_cluster = data['Cluster'] == i
    print(f"Cluster {i}:")
    print(get_top_ngrams(data.loc[in_cluster, 'Cleaned_Comments']))
    print("\n")

# Persist the cleaned text and cluster labels next to the raw data (optional)
data.to_csv('cleaned_comments_with_clusters.csv', index=False)
