# Creating framework which gives feedback based on key attributes

## Importing Dataset

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('../../Datasets/length_optimization_output.csv')

In [3]:
df.columns

Index(['IMPACT', 'SUBSTANCE', 'APPROPRIATENESS', 'SOUNDNESS_CORRECTNESS',
       'ORIGINALITY', 'RECOMMENDATION', 'CLARITY', 'REVIEWER_CONFIDENCE',
       'comments', 'strengths', 'weaknesses', 'general_discussion',
       'content_relevance', 'evidence_support', 'strength_argument_score',
       'weakness_argument_score', 'argument_strength', 'readability_index',
       'sentence_complexity', 'technical_depth', 'total_word_count',
       'strength_word_count', 'weakness_word_count', 'discussion_word_count',
       'normalized_length', 'unique_key_points', 'information_density',
       'unique_insights_per_word', 'optimization_score', 'composite_score',
       'review_quality', 'adjusted_argument_strength'],
      dtype='object')

In [4]:
df.head()

Unnamed: 0,IMPACT,SUBSTANCE,APPROPRIATENESS,SOUNDNESS_CORRECTNESS,ORIGINALITY,RECOMMENDATION,CLARITY,REVIEWER_CONFIDENCE,comments,strengths,...,weakness_word_count,discussion_word_count,normalized_length,unique_key_points,information_density,unique_insights_per_word,optimization_score,composite_score,review_quality,adjusted_argument_strength
0,0.6,0.8,1.0,0.8,0.6,0.6,0.6,0.6,- Strengths:\n* Outperforms ALIGN in supervise...,* Outperforms ALIGN in supervised entity linki...,...,167,0,0.157967,3,0.013043,18.991304,0.143186,3.474115,Poor,0.073401
1,0.6,0.8,1.0,0.8,0.6,0.8,0.6,0.8,This paper addresses the problem of disambigua...,No information available,...,0,0,0.497253,7,0.009669,14.077348,-0.03161,5.827519,Moderate,0.017836
2,0.6,0.8,0.8,0.8,0.6,0.8,0.6,0.8,"- Strengths:\nGood ideas, simple neural learni...","Good ideas, simple neural learning, interestin...",...,0,0,0.208104,3,0.009901,14.415842,0.099582,2.552753,Poor,0.034617
3,0.6,0.8,1.0,0.8,0.6,0.6,1.0,0.8,- Strengths:\nThe idea of hard monotonic atten...,The idea of hard monotonic attention is new an...,...,46,133,0.136676,2,0.01005,14.633166,0.132916,3.615575,Poor,0.067018
4,0.6,0.8,1.0,0.8,0.6,0.6,1.0,0.6,- Strengths: A new encoder-decoder model is pr...,No information available,...,0,848,0.609203,9,0.010147,14.773393,-0.070128,18.26179,Excellent,0.012103


## Calculating Thresholds

In [5]:
import numpy as np
from scipy.stats import gaussian_kde
from sklearn.mixture import GaussianMixture
from scipy.signal import argrelextrema

In [6]:
def pct_thresholds(series, lower_pct=0.25, upper_pct=0.75):
    """Return the lower and upper percentile cutoffs for a pandas Series."""
    p_low, p_high = series.quantile([lower_pct, upper_pct])
    return float(p_low), float(p_high)

In [7]:
def kde_valley(series, grid_size=1000):
    """
    Fit a 1D KDE and return the x‐value of the most prominent valley
    between the two highest peaks.
    """
    x = series.dropna().values
    kde = gaussian_kde(x)
    xs = np.linspace(x.min(), x.max(), grid_size)
    ys = kde(xs)

    # find peaks and valleys
    peaks = argrelextrema(ys, np.greater)[0]
    valleys = argrelextrema(ys, np.less)[0]

    if len(peaks) < 2 or len(valleys) < 1:
        # fallback to median
        return float(series.median())

    # choose the first valley that lies between the two highest peaks
    peak_heights = ys[peaks]
    top2 = peaks[np.argsort(peak_heights)[-2:]]
    low, high = sorted(top2)
    valid_valleys = [v for v in valleys if low < v < high]
    if not valid_valleys:
        return float(series.median())
    # map back to x coordinate
    return float(xs[valid_valleys[0]])

In [8]:
derived_thresholds = {}

In [9]:
# normalized_length: use 25th & 75th percentiles
derived_thresholds['normalized_length'] = pct_thresholds(df['normalized_length'], 0.25, 0.75)

In [10]:
# unique_key_points: 25th & 75th percentiles
derived_thresholds['unique_key_points'] = pct_thresholds(df['unique_key_points'], 0.25, 0.75)

In [11]:
# information_density: 25th & 75th percentiles
derived_thresholds['information_density'] = pct_thresholds(df['information_density'], 0.25, 0.75)

In [12]:
# unique_insights_per_word: median split
derived_thresholds['unique_insights_per_word'] = float(df['unique_insights_per_word'].median())

In [13]:
# composite_score: via 3‑way GMM, so reusing those two intersections
derived_thresholds['composite_score'] = (4.5941101516587315, 10.581525294084457) 

In [14]:
# adjusted_argument_strength: 75th percentile for “high” strength
derived_thresholds['adjusted_argument_strength'] = float(df['adjusted_argument_strength'].quantile(0.75))

In [15]:
print("Empirical THRESHOLDS =")
for k,v in derived_thresholds.items():
    print(f"  {k}: {v}")

Empirical THRESHOLDS =
  normalized_length: (0.1510989010989011, 0.3722527472527472)
  unique_key_points: (2.0, 5.0)
  information_density: (0.00727669710202455, 0.011291644452510826)
  unique_insights_per_word: 13.607476635514018
  composite_score: (4.5941101516587315, 10.581525294084457)
  adjusted_argument_strength: 0.0616150390018116


## Key Metrics and Thresholds

### normalized_length:

Range: Optimal between 0.1510989010989011 – 0.3722527472527472

Below 0.1510989010989011: Too concise (expand content).

Above 0.3722527472527472: Too verbose (condense content).

### unique_key_points:

Range: Optimal between 2 – 5

Below 2: Insufficient insights (add more unique ideas).

Above 5: Overloaded with ideas (streamline).

### information_density:

Range: Optimal between 0.00727669710202455 – 0.011291644452510826

Below 0.00727669710202455: Low density (improve relevance and focus).

Above 0.011291644452510826: High density (remove less relevant details).

### unique_insights_per_word:

Threshold: > 13.607476635514018

Below 13.607476635514018: Redundant content (suggest condensation).

### adjusted_argument_strength:

Range: Optimal > 0.0616150390018116

Below 0.0616150390018116: Weak arguments (recommend improving logical consistency or evidence).

### review_quality:

Categories: 'Excellent', 'Moderate', 'Poor'.

If 'Low': Significant revision required.

If 'Moderate': Minor adjustments for improvement.

If 'High': No major changes needed.

## Creating Framework

In [16]:
THRESHOLDS = derived_thresholds

In [17]:
def heuristic_optimization(row):
    suggestions = []

    if row["strength_word_count"] > 100 and row["strength_argument_score"] < THRESHOLDS["adjusted_argument_strength"]:
        suggestions.append("Summarize redundant strengths.")
    elif row["strength_word_count"] < 50 and row["strength_argument_score"] < THRESHOLDS["adjusted_argument_strength"]:
        suggestions.append("Add more impactful strengths.")

    if row["weakness_word_count"] > 100 and row["weakness_argument_score"] < THRESHOLDS["adjusted_argument_strength"]:
        suggestions.append("Remove repetitive criticisms.")
    elif row["weakness_word_count"] < 50 and row["weakness_argument_score"] < THRESHOLDS["adjusted_argument_strength"]:
        suggestions.append("Add specific, actionable weaknesses.")

    if row["discussion_word_count"] < 100 and row["information_density"] < THRESHOLDS["information_density"][0]:
        suggestions.append("Elaborate with new insights or examples.")
    elif row["discussion_word_count"] > 300 and row["information_density"] > THRESHOLDS["information_density"][1]:
        suggestions.append("Summarize key discussion points.")

    if row["normalized_length"] < THRESHOLDS["normalized_length"][0]:
        suggestions.append("Expand sections for better coverage.")
    elif row["normalized_length"] > THRESHOLDS["normalized_length"][1]:
        suggestions.append("Condense content to improve readability.")

    if row["unique_key_points"] < THRESHOLDS["unique_key_points"][0]:
        suggestions.append("Add more unique insights.")
    elif row["unique_key_points"] > THRESHOLDS["unique_key_points"][1]:
        suggestions.append("Streamline ideas for clarity.")

    if row["review_quality"] == "Low":
        suggestions.append("Significant revisions required.")
    elif row["review_quality"] == "Moderate":
        suggestions.append("Minor refinements recommended.")

    return suggestions

In [18]:
df["optimization_suggestions"] = df.apply(heuristic_optimization, axis=1)

In [19]:
display(df[["optimization_suggestions"]].head(10))

Unnamed: 0,optimization_suggestions
0,[]
1,"[Condense content to improve readability., Str..."
2,[]
3,[Expand sections for better coverage.]
4,"[Condense content to improve readability., Str..."
5,[]
6,[]
7,[Expand sections for better coverage.]
8,"[Elaborate with new insights or examples., Con..."
9,"[Elaborate with new insights or examples., Exp..."
