## The imports for this Jupyter notebook and context
pandas for DataFrames, data loading and wrangling
numpy for math utilities
scikit-learn for training a regressor with KFold cross-validation
sentence-transformers for the text to vector embeddings to become input features for scikit-learn
matplotlib for plotting distributions

In [39]:
import pandas as pd
import numpy as np
from sklearn import datasets
from sentence_transformers import SentenceTransformer
import matplotlib

import os
import glob
from pathlib import Path
print("All imported")

All imported


## Workflow for data and processing

- pandas (load/clean) 
- sentence-transformers (embed text)
- numpy (store as arrays)
- scikit-learn (train/eval models) 
- matplotlib (visualize results).

## Import and clean data

In [43]:
# Define the dataset directory
dataset_dir = "data/datasets/storypoint/IEEE TSE2018/dataset"
# dataset_dir = "dummy_data" # This is a test csv to ensure cleaning of NULL values.
# Function to clean individual dataframes
def clean_dataframe(df, filename):
    """Clean a single dataframe with common preprocessing steps."""
    print(f"Cleaning {filename}:")
    print(f"  Original shape: {df.shape}")
    
    # Replace various NULL representations with actual NaN
    df = df.replace(['NULL', 'null', 'Null', ''], pd.NA)
    
    # Remove rows where ANY column has NaN/NULL values
    df = df.dropna(how='any')
    
    # Remove completely empty rows and columns
    df = df.dropna(how='all').dropna(axis=1, how='all')
    
    # Strip whitespace from string columns
    string_columns = df.select_dtypes(include=['object']).columns
    for col in string_columns:
        df[col] = df[col].astype(str).str.strip()
        # Remove rows with 'nan' strings that might have been created
        df = df[df[col] != 'nan']
        df = df[df[col] != 'None']
    
    # Convert numeric columns (story points, etc.)
    numeric_keywords = ['storypoint']
    for col in df.columns:
        if any(keyword in col.lower() for keyword in numeric_keywords):
            df[col] = pd.to_numeric(df[col], errors='coerce')
            # Remove rows where numeric conversion failed
            df = df.dropna(subset=[col])
    
    print(f"  Cleaned shape: {df.shape}")
    print(f"  Columns: {list(df.columns)}")
    print(f"  Data types: {df.dtypes.to_dict()}")
    print()
    
    return df

# Import all CSV files
csv_files = glob.glob(os.path.join(dataset_dir, "*.csv"))
dataframes = {}

print(f"Found {len(csv_files)} CSV files in {dataset_dir}")
print(f"These are the files: {csv_files}")
print("=" * 50)

for csv_file in csv_files:
    filename = Path(csv_file).name
    try:
        # Try different encodings
        df = None
        for encoding in ['utf-8', 'latin-1', 'cp1252']:
            try:
                # Read CSV keeping NULL as strings initially so we can handle them properly
                df = pd.read_csv(csv_file, encoding=encoding, keep_default_na=False)
                print(f"Successfully loaded {filename} with {encoding} encoding")
                break
            except UnicodeDecodeError:
                print(f"Failed to load {filename} with {encoding} encoding")
                continue
        
        if df is None:
            # If all encodings fail, use utf-8 with error handling
            df = pd.read_csv(csv_file, encoding='utf-8', errors='ignore', keep_default_na=False)
            print(f"Loaded {filename} with utf-8 encoding and error handling")
        
        # Clean the dataframe
        df_cleaned = clean_dataframe(df, filename)
        
        if len(df_cleaned) > 0:  # Only store if we have data left after cleaning
            # Store with filename as key (without .csv extension)
            key = filename.replace('.csv', '')
            dataframes[key] = df_cleaned
            
            # Display first few rows
            print(f"Sample data from {filename}:")
            print(df_cleaned.head())
        else:
            print(f"No data remaining after cleaning {filename}")
        
        print("=" * 50)
        
    except Exception as e:
        print(f"Error loading {filename}: {str(e)}")
        print("=" * 50)

# Summary
print(f"\nSuccessfully loaded {len(dataframes)} datasets:")
for name, df in dataframes.items():
    print(f"  {name}: {df.shape[0]} rows, {df.shape[1]} columns")
    print(f"    No missing values (all NULL entries completely removed)")

Found 16 CSV files in data/datasets/storypoint/IEEE TSE2018/dataset
These are the files: ['data/datasets/storypoint/IEEE TSE2018/dataset/usergrid.csv', 'data/datasets/storypoint/IEEE TSE2018/dataset/bamboo.csv', 'data/datasets/storypoint/IEEE TSE2018/dataset/mule.csv', 'data/datasets/storypoint/IEEE TSE2018/dataset/jirasoftware.csv', 'data/datasets/storypoint/IEEE TSE2018/dataset/talenddataquality.csv', 'data/datasets/storypoint/IEEE TSE2018/dataset/duracloud.csv', 'data/datasets/storypoint/IEEE TSE2018/dataset/moodle.csv', 'data/datasets/storypoint/IEEE TSE2018/dataset/appceleratorstudio.csv', 'data/datasets/storypoint/IEEE TSE2018/dataset/talendesb.csv', 'data/datasets/storypoint/IEEE TSE2018/dataset/mesos.csv', 'data/datasets/storypoint/IEEE TSE2018/dataset/springxd.csv', 'data/datasets/storypoint/IEEE TSE2018/dataset/aptanastudio.csv', 'data/datasets/storypoint/IEEE TSE2018/dataset/mulestudio.csv', 'data/datasets/storypoint/IEEE TSE2018/dataset/datamanagement.csv', 'data/datasets/s

## Embed text

In [44]:

# Load a pretrained Sentence Transformer model
model = SentenceTransformer("all-MiniLM-L6-v2")

for name, df in dataframes.items():
    # Calculate the embeddings and add as a new column to the dataframe for the story
    sentences = df['description'].tolist()
    embeddings = model.encode(sentences)

    print(f"Embeddings Shape for {name}: {embeddings.shape}")
    
    # Store as numpy arrays for scikit-learn
    df['embeddings'] = [np.array(emb) for emb in embeddings]

for name, df in dataframes.items():
    print(f"{name}: {df.shape[0]} rows, embedding dimension: {df['embeddings'].iloc[0].shape}")
    # print(f"Type of stored embedding: {type(df['embeddings'].iloc[0])}")
    print(f"Dataframe: {df}")

Embeddings Shape for usergrid: (333, 384)
Embeddings Shape for bamboo: (374, 384)
Embeddings Shape for mule: (889, 384)
Embeddings Shape for jirasoftware: (286, 384)
Embeddings Shape for talenddataquality: (1136, 384)
Embeddings Shape for duracloud: (613, 384)
Embeddings Shape for moodle: (1166, 384)
Embeddings Shape for appceleratorstudio: (2876, 384)
Embeddings Shape for talendesb: (775, 384)
Embeddings Shape for mesos: (1562, 384)
Embeddings Shape for springxd: (3056, 384)
Embeddings Shape for aptanastudio: (771, 384)
Embeddings Shape for mulestudio: (732, 384)
Embeddings Shape for datamanagement: (4030, 384)
Embeddings Shape for titanium: (2122, 384)
Embeddings Shape for clover: (361, 384)
usergrid: 333 rows, embedding dimension: (384,)
Dataframe:           issuekey                                              title  \
0      USERGRID-16  Asset data does not correctly obey contextual ...   
1      USERGRID-17              Expose refresh token at the REST tier   
2      USERGRID-19 

## Train model

In [None]:
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, root_mean_squared_error

from sklearn.linear_model import Ridge, ElasticNet
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor

df = dataframes['clover']
print(df)
# Stack into a 2D array: shape (n_samples, n_dims)
X = np.vstack(df['embeddings'].values)
y = df['storypoint'].astype(float).values  # or int if you prefer

# Optional: quick sanity checks
assert X.ndim == 2, "Embeddings didn't stack to 2D; check shapes."
assert len(X) == len(y), "Feature/target length mismatch."

# 2) Train/validation split (hold out a test set for an honest check)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=42)

# 3) Define a few good baselines
models = {
    "Ridge": make_pipeline(StandardScaler(), Ridge(alpha=1.0, random_state=42)),
    "ElasticNet": make_pipeline(StandardScaler(), ElasticNet(alpha=0.05, l1_ratio=0.2, random_state=42)),
    "SVR(RBF)": make_pipeline(StandardScaler(), SVR(kernel="rbf", C=2.0, epsilon=0.2)),
    "RandomForest": RandomForestRegressor(n_estimators=300, max_depth=None, random_state=42, n_jobs=-1),
    "HistGBR": HistGradientBoostingRegressor(loss="squared_error", max_depth=None, random_state=42)
}

# 4) Cross-validate each model with MAE (lower is better)
cv = KFold(n_splits=5, shuffle=True, random_state=42)
cv_results = {}
for name, model in models.items():
    scores = cross_val_score(model, X_tr, y_tr, scoring="neg_mean_absolute_error", cv=cv, n_jobs=-1)
    cv_results[name] = -scores.mean()

# 5) Pick the best by CV MAE and fit it on the full training set
best_name = min(cv_results, key=cv_results.get)
best_model = models[best_name]
best_model.fit(X_tr, y_tr)

# 6) Evaluate on the held-out test set
pred = best_model.predict(X_te)
mae = mean_absolute_error(y_te, pred)
rmse = root_mean_squared_error(y_te, pred)

print("CV MAE per model:", cv_results)
print(f"Best model: {best_name}")
print(f"Test MAE:  {mae:.3f}")
print(f"Test RMSE: {rmse:.3f}")

# 7) (Optional) If story points are discrete, you can round predictions
pred_rounded = np.rint(pred).astype(int)
print("Rounded Test MAE:", mean_absolute_error(y_te, pred_rounded))


      issuekey                                              title  \
0    CLOV-1086                 Line coverage data is inconsistent   
1     CLOV-379  Surefire classpath is incorrect when depending...   
5    CLOV-1085   Message in Balloon after clean snapshot is bogus   
6    CLOV-1084  No instrumentation Done, Always get two Tests ...   
8     CLOV-701  Test Columns are empty (or -1) in project.js w...   
..         ...                                                ...   
376  CLOV-1890  Provide simple reporting task which generate C...   
380  CLOV-1951                           Support Eclipse 4.6 Neon   
381  CLOV-1953  Clover does not instrument all nodes in Groovy...   
382  CLOV-1955                       Support InteliiJ IDEA 2016.2   
383  CLOV-1960  Instrument Groovy traits and show code coverag...   

                                           description  storypoint  \
0    I'm running 2.4.1 on IDEA 7 and get inconsiste...           2   
1    We have two different appl

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

CV Accuracy per model: {'LogReg': np.float64(0.2431336963097399), 'SVC(RBF)': np.float64(0.3159104658197217), 'RandomForest': np.float64(0.3194797338173019), 'HistGB': np.float64(0.2983061101028433)}
Test Accuracy: 0.2876712328767123

Classification Report:
               precision    recall  f1-score   support

           1       0.31      0.86      0.45        22
           2       0.14      0.06      0.09        16
           3       0.50      0.08      0.14        12
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         9
           8       0.00      0.00      0.00         5
          13       0.00      0.00      0.00         5
          20       0.00      0.00      0.00         2
          40       0.00      0.00      0.00         1

    accuracy                           0.29        73
   macro avg       0.11      0.11      0.08        73
weighted avg       0.21      0.29      0.18        73


Confusion Matrix:
 [[19  2  1  0  0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


## Visualize Data