# Metadata

**L1 Taxonomy** - Data Analysis and Engineering

**L2 Taxonomy** - Pandas for DataFrames

**Subtopic** - Using .eval() and .assign() for efficient DataFrame computations

**Use Case** - Develop a Python module that reads a CSV file into a DataFrame, performs computations using .eval() and .assign() methods, and writes the results back to a new CSV file.

**Programming Language** - Python

**Target Model** - GPT-4o

# Setup

```requirements.txt
pandas
```


# Prompt

I'm trying to solve a data problem with pandas. The input CSV file will always have the columns A, B, C, D, and Group, and the output CSV file must have two additional columns, E and F.

**Input Format**

- A CSV file containing numerical and categorical columns.

**Output Format**

- A new CSV file with 2 new columns E and F.


**Examples**
```
Input CSV:

A,B,C,D,Group
1,2,3,4,X
5,,2,1,Y
,,1,1,X
4,4,4,4,Y

Output CSV:

A,B,C,D,Group,E,F
1,2,3,4,X,-9.0,-9.0
5,,2,1,Y,,-8.0
,,1,1,X,,-9.0
4,4,4,4,Y,-8.0,-8.0
```






# Requirements

**Explicit and Implicit Points**

- Use .eval() for all expressions involving A, B, C, D.
- Use .assign() to compute new columns.
- For handling NaNs, only use .eval() or .assign()
- The output file must contain all original columns plus:
  - A new column E, computed via a formula using .eval(), `E = A + B - C * D`.
  - For each group, compute the mean and median of E using .groupby().
  - A new column F, created using .assign() that is based on E and condition, `F = E.mean() if Group == 'X' else E.median()` applied per group.


**Solution Expectations**

- Computes columns E and F.
- Saves the resulting DataFrame to a new file.
- The solution must:
  - Not use any external libraries other than pandas.
  - Not use .fillna() or .apply().

**Function Signatures**

```python
def transform_csv(filename: str) -> None:
    pass
```

**Edge Case Behavior**

- If all values in a row are NaN, ensure it does not break .eval().
- If a group does not contain at least one complete row, ensure logic for F still works.
- Do not allow default pandas NaN behavior to override logic.
- If the CSV file is empty, print "Empty CSV".

**Constraints**

- Must work even when A, B, C, D are partially missing.
- All new column logic must be expressed inline in .assign().
- No use of helper functions is allowed.
- Only pandas is allowed as an external library.

In [None]:
# code

"""This module transforms a CSV file."""

from pathlib import Path

import pandas as pd


def transform_csv(filename: str) -> None:
    """Transform the input CSV.

    By computing columns E and F.
    Save the result.
    """
    df = pd.read_csv(filename)

    if df.empty:
        print("Empty CSV")
        return

    df.eval('E = A + B - C * D', inplace=True)

    group_means = df.groupby('Group')['E'].transform('mean')
    group_medians = df.groupby('Group')['E'].transform('median')

    df = df.assign(F=group_means.where(df['Group'] == 'X', group_medians))

    output_filename = 'output_' + filename
    df.to_csv(output_filename, index=False)


# Run the sample transformation only when this code is executed directly
# (script or notebook cell); importing the module, as the unit tests do,
# must not try to read 'csv2_test.csv'.
if __name__ == '__main__':
    transform_csv('csv2_test.csv')


In [None]:
# tests

"""
Unit tests for transform_csv, ensuring correct E/F computation, file output, and edge case handling based on the prompt requirements.
"""
import os
import tempfile
import unittest
import pandas as pd
from io import StringIO
import sys

from transform_module import transform_csv


class TestTransformCsv(unittest.TestCase):
    """
    Test suite for transform_csv function.

    Each test runs inside its own temporary working directory so that the
    CSV files it creates, and the file written by transform_csv, cannot
    leak between tests.
    """

    def setUp(self) -> None:
        """
        Prepare a temporary directory for test isolation.
        """
        self.tempdir = tempfile.TemporaryDirectory()
        self.orig_cwd = os.getcwd()
        os.chdir(self.tempdir.name)
        # Input files created through write_csv; find_output_file ignores
        # them, whatever they are called.
        self.input_files = set()

    def tearDown(self) -> None:
        """
        Clean up temporary directory and restore working directory.
        """
        os.chdir(self.orig_cwd)
        self.tempdir.cleanup()

    def write_csv(self, filename: str, content: str) -> None:
        """
        Write raw CSV content to a file and remember it as a test input.

        Args:
            filename: Name of the CSV file to create.
            content: CSV text to write.
        """
        with open(filename, 'w') as f:
            f.write(content)
        self.input_files.add(filename)

    def find_output_file(self) -> str:
        """
        Locate the output CSV generated by transform_csv.

        Any CSV in the working directory that was not created through
        write_csv is treated as output; exactly one such file must exist.

        Returns:
            Filename of the transformed CSV.
        """
        files = [
            f for f in os.listdir('.')
            if f.endswith('.csv') and f not in self.input_files
        ]
        self.assertEqual(len(files), 1, 'Expected exactly one output CSV file')
        return files[0]

    def test_example_transformation(self) -> None:
        """
        Verify E = A + B - C * D and F uses mean for Group X and median for Group Y.
        """
        csv_in = (
            "A,B,C,D,Group\n"
            "1,2,3,4,X\n"
            "5,,2,1,Y\n"
            ",,1,1,X\n"
            "4,4,4,4,Y\n"
        )
        self.write_csv('input.csv', csv_in)
        transform_csv('input.csv')
        out = self.find_output_file()
        df = pd.read_csv(out)
        # Rows 1 and 2 lack A and/or B, so their E is missing.
        expected_e = [1+2-3*4, None, None, 4+4-4*4]
        # Group X: valid E = [-9] -> mean -9; Group Y: valid E = [-8] -> median -8.
        expected_f = [-9, -8, -9, -8]
        for idx, (e_exp, f_exp) in enumerate(zip(expected_e, expected_f)):
            if e_exp is None:
                self.assertTrue(pd.isna(df.loc[idx, 'E']))
            else:
                self.assertAlmostEqual(df.loc[idx, 'E'], float(e_exp))
            self.assertAlmostEqual(df.loc[idx, 'F'], float(f_exp))

    def test_empty_csv(self) -> None:
        """
        An empty CSV should print 'Empty CSV' and not produce an output file.
        """
        self.write_csv('empty.csv', 'A,B,C,D,Group\n')
        captured = StringIO()
        original_stdout = sys.stdout
        sys.stdout = captured
        try:
            transform_csv('empty.csv')
        finally:
            # Restore stdout even if transform_csv raises, so a failure here
            # cannot swallow the output of later tests.
            sys.stdout = original_stdout
        self.assertIn('Empty CSV', captured.getvalue())
        files = [f for f in os.listdir('.') if f.endswith('.csv')]
        self.assertEqual(files, ['empty.csv'])

    def test_all_nan_row(self) -> None:
        """
        Rows with all NaN in A-D must not break evaluation and should yield NaN E.
        """
        csv_in = (
            "A,B,C,D,Group\n"
            ",,, ,X\n"
        )
        self.write_csv('nanrow.csv', csv_in)
        transform_csv('nanrow.csv')
        out = self.find_output_file()
        df = pd.read_csv(out)
        self.assertTrue(pd.isna(df.loc[0, 'E']))
        self.assertTrue(pd.isna(df.loc[0, 'F']))

    def test_missing_file_raises(self) -> None:
        """
        Supplying a non-existent file path should raise FileNotFoundError.
        """
        with self.assertRaises(FileNotFoundError):
            transform_csv('nofile.csv')

    def test_partial_nan_computation(self) -> None:
        """
        Partial NaNs in A-D yield NaN E but F computed per group mean/median.
        """
        csv_in = (
            "A,B,C,D,Group\n"
            "2,3, , ,X\n"
            "2,2,1,1,X\n"
        )
        self.write_csv('input.csv', csv_in)
        transform_csv('input.csv')
        out = self.find_output_file()
        df = pd.read_csv(out)
        self.assertTrue(pd.isna(df.loc[0, 'E']))
        # Only valid E is row1 => 2+2-1*1=3, so mean=median=3
        self.assertAlmostEqual(df.loc[0, 'F'], 3.0)
        self.assertAlmostEqual(df.loc[1, 'E'], 3.0)

    def test_original_columns_retained(self) -> None:
        """
        Output CSV must preserve A,B,C,D,Group in order before E and F.
        """
        self.write_csv('input.csv', 'A,B,C,D,Group\n1,1,1,1,X\n')
        transform_csv('input.csv')
        out = self.find_output_file()
        df = pd.read_csv(out)
        self.assertEqual(list(df.columns), ['A', 'B', 'C', 'D', 'Group', 'E', 'F'])

    def test_output_file_exists(self) -> None:
        """
        After transformation, a new CSV file should exist.
        """
        self.write_csv('data.csv', 'A,B,C,D,Group\n1,2,3,4,X\n')
        transform_csv('data.csv')
        out = self.find_output_file()
        self.assertTrue(os.path.exists(out))

    def test_group_statistics(self) -> None:
        """
        F should use mean for group X and median for group Y explicitly.
        """
        csv_in = (
            "A,B,C,D,Group\n"
            "1,1,1,1,X\n"
            "2,2,2,2,X\n"
            "3,3,3,3,Y\n"
            "4,4,4,4,Y\n"
        )
        self.write_csv('input.csv', csv_in)
        transform_csv('input.csv')
        out = self.find_output_file()
        df = pd.read_csv(out)
        # E for X rows: [1+1-1*1=1, 2+2-2*2=0] mean = 0.5 -> F on every X row
        for x_f in df.loc[df['Group'] == 'X', 'F']:
            self.assertAlmostEqual(x_f, 0.5)
        # E for Y rows: [3+3-3*3=-3, 4+4-4*4=-4] median = avg of middle => -3.5
        for y_f in df.loc[df['Group'] == 'Y', 'F']:
            self.assertAlmostEqual(y_f, -3.5)

if __name__ == '__main__':
    unittest.main()



# Model Breaking Proof

#### Model Breaking Task URL: <https://rlhf-v3.turing.com/prompt/704cd37c-b071-4b8a-8f19-83bd57f68682?origin=https%3A%2F%2Flabeling-ms.turing.com&redirect_url=https%3A%2F%2Flabeling-ms.turing.com%2Fconversations%2F74353%2Fview>

#### Model code:

```python
import pandas as pd

def transform_csv(filename: str) -> None:
    """Model-generated attempt, kept verbatim as evidence of the failure.

    Reads ``filename`` into a DataFrame, prints "Empty CSV" and returns when
    the frame is empty or when ``pandas.read_csv`` raises ``EmptyDataError``,
    computes ``E = A + B - C * D`` with ``.eval()``, derives ``F`` from the
    per-group mean/median of ``E``, and writes the result to the fixed path
    ``'output.csv'`` without the index.
    """
    try:
        # Read the input CSV file
        df = pd.read_csv(filename)
        
        if df.empty:
            print("Empty CSV")
            return
        
        # Calculate column E using .eval(), handling NaN implicitly
        df = df.eval('E = A + B - C * D')
        
        # Per-row group statistics of E, aligned with df's index
        group_means = df.groupby('Group')['E'].transform('mean')
        group_medians = df.groupby('Group')['E'].transform('median')

        # <Issue>: <Uses .apply(), which is not allowed and this violates the prompt.>
        # The row-wise lambda below picks the group mean for Group 'X' and
        # the group median otherwise, but it does so through
        # DataFrame.apply(axis=1), which the Solution Expectations forbid.
        
        # Add column F using .assign(), evaluated per group
        df = df.assign(F=lambda x: x.apply(lambda row: group_means[row.name] if row['Group'] == 'X' else group_medians[row.name], axis=1))
        
        # Save the resulting DataFrame to 'output.csv'
        df.to_csv('output.csv', index=False)

    except pd.errors.EmptyDataError:
        # Raised by read_csv when the file has no columns to parse
        print("Empty CSV")
```