## The imports for this Jupyter notebook and context
- pandas for DataFrames, data loading and wrangling
- numpy for math utilities
- scikit-learn for training a regressor with KFold cross-validation
- sentence-transformers for turning text into vector embeddings that serve as input features for scikit-learn
- matplotlib for plotting distributions

In [8]:
import pandas as pd
import numpy as np
from sklearn import datasets
from sentence_transformers import SentenceTransformer
import matplotlib

import os
import glob
from pathlib import Path
print("All imported")

All imported


## Workflow for data and processing

pandas (load/clean) → sentence-transformers (embed text) → numpy (store as arrays) → scikit-learn (train/eval models) → matplotlib (visualize results).

## Import and clean data

In [29]:
# Directory that is scanned for the CSV datasets.
# Alternative location, currently disabled:
#   dataset_dir = "data/datasets/storypoint/IEEE TSE2018/dataset"
# "dummy_data" holds a test csv used to ensure cleaning of NULL values.
dataset_dir = "dummy_data"
# Function to clean individual dataframes
def clean_dataframe(df, filename):
    """Clean a single dataframe with common preprocessing steps.

    Steps, in order:
      1. Trim surrounding whitespace from every string cell.
      2. Treat 'NULL', 'null', 'Null' and '' as missing and drop every row
         that has a missing value in any column.
      3. Cast text columns to ``str`` and drop rows whose text is the
         literal 'nan' or 'None'.
      4. Convert every column whose name contains 'storypoint' to a numeric
         dtype and drop the rows where that conversion fails.

    Progress (shapes, columns, dtypes) is printed along the way.

    Args:
        df: Raw dataframe to clean. It is copied, not modified.
        filename: Label used only in the printed progress messages.

    Returns:
        A new DataFrame containing only the rows that survived cleaning.
    """
    def text_columns(frame):
        # Labels of the columns whose dtype holds text (object or a pandas
        # string dtype), looked up via dtypes so the frame is not indexed.
        return [
            col for col, dtype in frame.dtypes.items()
            if pd.api.types.is_string_dtype(dtype)
        ]

    print(f"Cleaning {filename}:")
    print(f"  Original shape: {df.shape}")

    # Work on a copy so the caller's frame is left untouched.
    df = df.copy()

    # Trim BEFORE looking for NULL markers: otherwise a padded marker such as
    # " NULL " or a whitespace-only cell is not recognised as missing and
    # survives as "NULL" / "" in the cleaned data.
    for col in text_columns(df):
        df[col] = df[col].map(
            lambda value: value.strip() if isinstance(value, str) else value
        )

    # Replace various NULL representations with actual missing values
    df = df.replace(['NULL', 'null', 'Null', ''], pd.NA)

    # Remove rows where ANY column has a missing value
    df = df.dropna(how='any')

    # Kept from the original pipeline; rows with any missing value are
    # already gone at this point.
    df = df.dropna(how='all').dropna(axis=1, how='all')

    # Normalise text columns to str and drop rows holding the literal
    # strings 'nan' / 'None'.
    for col in text_columns(df):
        df[col] = df[col].astype(str).str.strip()
        df = df[~df[col].isin(['nan', 'None'])].copy()

    # Convert numeric columns (story points, etc.)
    numeric_keywords = ['storypoint']
    for col in df.columns:
        # str() so a non-string column label cannot break the keyword check
        if any(keyword in str(col).lower() for keyword in numeric_keywords):
            df[col] = pd.to_numeric(df[col], errors='coerce')
            # Remove rows where numeric conversion failed
            df = df.dropna(subset=[col])

    print(f"  Cleaned shape: {df.shape}")
    print(f"  Columns: {list(df.columns)}")
    print(f"  Data types: {df.dtypes.to_dict()}")
    print()

    return df

# Import all CSV files.
# Sorted so the datasets are processed in the same order on every run.
csv_files = sorted(glob.glob(os.path.join(dataset_dir, "*.csv")))
dataframes = {}  # dataset name (file stem) -> cleaned DataFrame

print(f"Found {len(csv_files)} CSV files in {dataset_dir}")
print(f"These are the files: {csv_files}")
print("=" * 50)

for csv_file in csv_files:
    filename = Path(csv_file).name
    try:
        # Try different encodings. cp1252 is tried before latin-1 because
        # latin-1 can decode any byte sequence and would otherwise make the
        # cp1252 attempt unreachable.
        df = None
        for encoding in ['utf-8', 'cp1252', 'latin-1']:
            try:
                # Read CSV keeping NULL as strings initially so we can handle them properly
                df = pd.read_csv(csv_file, encoding=encoding, keep_default_na=False)
                print(f"Successfully loaded {filename} with {encoding} encoding")
                break
            except UnicodeDecodeError:
                print(f"Failed to load {filename} with {encoding} encoding")
                continue

        if df is None:
            # Safety net if every encoding fails: decode as utf-8 and drop
            # undecodable bytes. The read_csv keyword is `encoding_errors`.
            df = pd.read_csv(
                csv_file,
                encoding='utf-8',
                encoding_errors='ignore',
                keep_default_na=False,
            )
            print(f"Loaded {filename} with utf-8 encoding and error handling")

        # Clean the dataframe
        df_cleaned = clean_dataframe(df, filename)

        if len(df_cleaned) > 0:  # Only store if we have data left after cleaning
            # Store with the file stem as key; .stem removes only the final
            # extension, unlike str.replace which strips every '.csv' occurrence.
            key = Path(csv_file).stem
            dataframes[key] = df_cleaned

            # Display first few rows
            print(f"Sample data from {filename}:")
            print(df_cleaned.head())
        else:
            print(f"No data remaining after cleaning {filename}")

        print("=" * 50)

    except Exception as e:
        # Best effort: report the failing file and continue with the others.
        print(f"Error loading {filename}: {str(e)}")
        print("=" * 50)

# Summary
print(f"\nSuccessfully loaded {len(dataframes)} datasets:")
for name, df in dataframes.items():
    print(f"  {name}: {df.shape[0]} rows, {df.shape[1]} columns")
    # Check the claim instead of printing it unconditionally.
    remaining_missing = int(df.isna().sum().sum())
    if remaining_missing == 0:
        print("    No missing values (all NULL entries completely removed)")
    else:
        print(f"    WARNING: {remaining_missing} missing values remain after cleaning")

Found 1 CSV files in dummy_data
These are the files: ['dummy_data/testdataset.csv']
Successfully loaded testdataset.csv with utf-8 encoding
Cleaning testdataset.csv:
  Original shape: (7, 4)
  Cleaned shape: (2, 4)
  Columns: ['issuekey', 'title', 'description', 'storypoint']
  Data types: {'issuekey': dtype('O'), 'title': dtype('O'), 'description': dtype('O'), 'storypoint': dtype('int64')}

Sample data from testdataset.csv:
  issuekey                                              title  \
5   TESB-6  S9 Investigate MDM Call ESB Provider from MDM ...   
6  TESB-33  Common - Setup ESB Runtime Code Repositories (...   

                                         description  storypoint  
5                                            NULL123           5  
6  Code Repository: It is expected that we setup ...           3  

Successfully loaded 1 datasets:
  testdataset: 2 rows, 4 columns
    No missing values (all NULL entries completely removed)


## Embed text

In [None]:

# Load a pretrained Sentence Transformer model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Iterate over a snapshot of the items because the dict values are rebound below.
for name, df in list(dataframes.items()):
    # Embed the description text of every row in this dataset
    sentences = df['description'].tolist()
    embeddings = model.encode(sentences)

    print(f"Embeddings Shape for {name}: {embeddings.shape}")

    # Store one numpy array per row for scikit-learn. assign() builds a new
    # frame, so the frame created by the cleaning cell is not mutated in place
    # and re-running this cell starts from the same input.
    dataframes[name] = df.assign(embeddings=[np.array(emb) for emb in embeddings])

for name, df in dataframes.items():
    print(f"{name}: {df.shape[0]} rows, embedding dimension: {df['embeddings'].iloc[0].shape}")
    # Show a preview only; dumping the whole frame floods the output.
    print(f"Dataframe preview:\n{df.head()}")

Embeddings Shape for testdataset: (2, 384)
testdataset: 2 rows, embedding dimension: (384,)
Type of stored embedding: <class 'numpy.ndarray'>
dataframe   issuekey                                              title  \
5   TESB-6  S9 Investigate MDM Call ESB Provider from MDM ...   
6  TESB-33  Common - Setup ESB Runtime Code Repositories (...   

                                         description  storypoint  \
5                                            NULL123           5   
6  Code Repository: It is expected that we setup ...           3   

                                          embeddings  
5  [-0.062368233, 0.059396103, -0.09936537, -0.06...  
6  [-0.013562303, -0.13194102, 0.020181717, -0.06...  


## Train model

## Visualize Data