# Processing

We'll use alpha-numeric sequences, and only alpha-numeric sequences, as tokens. Alpha-numeric tokens contain only letters a-z and numbers 0-9 (no other characters). In other words, we'll tokenize on punctuation to generate n-gram statistics.

In [1]:
import pandas as pd
df=pd.read_csv('Training_Data.csv',index_col=0)
LABELS=['Function',
 'Use',
 'Sharing',
 'Reporting',
 'Student_Type',
 'Position_Type',
 'Object_Type',
 'Pre_K',
 'Operating_Status']
# Define the lambda function: categorize_label
categorize_label = lambda x: x.astype('category')

# Convert df[LABELS] to a categorical type
df[LABELS] = df[LABELS].apply(categorize_label,axis=0)
print(df.dtypes)

Function                  category
Use                       category
Sharing                   category
Reporting                 category
Student_Type              category
Position_Type             category
Object_Type               category
Pre_K                     category
Operating_Status          category
Object_Description          object
Text_2                      object
SubFund_Description         object
Job_Title_Description       object
Text_3                      object
Text_4                      object
Sub_Object_Description      object
Location_Description        object
FTE                        float64
Function_Description        object
Facility_or_Department      object
Position_Extra              object
Total                      float64
Program_Description         object
Fund_Description            object
Text_1                      object
dtype: object


In [2]:
import numpy as np
import pandas as pd

def multilabel_sample(y, size=1000, min_count=5, seed=None):
    """ Takes a matrix of binary labels `y` and returns
        the indices for a sample of size `size` if
        `size` > 1 or `size` * len(y) if size =< 1.
        The sample is guaranteed to have > `min_count` of
        each label.
    """
    try:
        if (np.unique(y).astype(int) != np.array([0, 1])).all():
            raise ValueError()
    except (TypeError, ValueError):
        raise ValueError('multilabel_sample only works with binary indicator matrices')

    if (y.sum(axis=0) < min_count).any():
        raise ValueError('Some classes do not have enough examples. Change min_count if necessary.')

    if size <= 1:
        size = np.floor(y.shape[0] * size)

    if y.shape[1] * min_count > size:
        msg = "Size less than number of columns * min_count, returning {} items instead of {}."
        warn(msg.format(y.shape[1] * min_count, size))
        size = y.shape[1] * min_count

    rng = np.random.RandomState(seed if seed is not None else np.random.randint(1))

    if isinstance(y, pd.DataFrame):
        choices = y.index
        y = y.values
    else:
        choices = np.arange(y.shape[0])

    sample_idxs = np.array([], dtype=choices.dtype)

    # first, guarantee > min_count of each label
    for j in range(y.shape[1]):
        label_choices = choices[y[:, j] == 1]
        label_idxs_sampled = rng.choice(label_choices, size=min_count, replace=False)
        sample_idxs = np.concatenate([label_idxs_sampled, sample_idxs])

    sample_idxs = np.unique(sample_idxs)

    # now that we have at least min_count of each, we can just random sample
    sample_count = int(size - sample_idxs.shape[0])

    # get sample_count indices from remaining choices
    remaining_choices = np.setdiff1d(choices, sample_idxs)
    remaining_sampled = rng.choice(remaining_choices,
                                   size=sample_count,
                                   replace=False)

    return np.concatenate([sample_idxs, remaining_sampled])


def multilabel_sample_dataframe(df, labels, size, min_count=5, seed=None):
    """ Takes a dataframe `df` and returns a sample of size `size` where all
        classes in the binary matrix `labels` are represented at
        least `min_count` times.
    """
    idxs = multilabel_sample(labels, size=size, min_count=min_count, seed=seed)
    return df.loc[idxs]


def multilabel_train_test_split(X, Y, size, min_count=5, seed=None):
    """ Takes a features matrix `X` and a label matrix `Y` and
        returns (X_train, X_test, Y_train, Y_test) where all
        classes in Y are represented at least `min_count` times.
    """
    index = Y.index if isinstance(Y, pd.DataFrame) else np.arange(Y.shape[0])

    test_set_idxs = multilabel_sample(Y, size=size, min_count=min_count, seed=seed)
    train_set_idxs = np.setdiff1d(index, test_set_idxs)

    test_set_mask = index.isin(test_set_idxs)
    train_set_mask = ~test_set_mask

    return (X[train_set_mask], X[test_set_mask], Y[train_set_mask], Y[test_set_mask])

In [3]:
from sklearn.preprocessing import FunctionTransformer

# Get the dummy encoding of the labels
dummy_labels = pd.get_dummies(df[LABELS])

# Get the columns that are features in the original df
NON_LABELS = [c for c in df.columns if c not in LABELS]

# Split into training and test sets
X_train, X_test, y_train, y_test = multilabel_train_test_split(df[NON_LABELS],
                                                               dummy_labels,
                                                               0.2, 
                                                               seed=123)

In [18]:
# Define combine_text_columns()
NUMERIC_COLUMNS=['FTE','Total']
def combine_text_columns(data_frame, to_drop=NUMERIC_COLUMNS + LABELS):
    """ converts all text in each row of data_frame to single vector by converting all NAN's into " " and joining on spaces"""
    
    print("\n\nThe Initial To_Drop contains")
    print("---------------------------------\n\n")
    print(to_drop)
    print('\n')
    print("\n\nThe Initial Columns of the DataFrame are:")
    print("----------------------------------------\n\n")
    print(data_frame.columns.tolist())
    print('\n')
    # Drop non-text columns that are in the df
    to_drop = set(to_drop) & set(data_frame.columns.tolist())
    print("\n\nThe Final To_Drop Contains : ")
    print("---------------------------------\n\n")
    print(to_drop)
    
    #to_drop contains the columns to be dropped(NON-TEXT Columns)
    text_data =data_frame.drop(to_drop,axis=1)
    print("\n\n THE SHAPE OF TEXT DATA")
    print("--------------------\n\n")
    print(text_data.shape)
    print("\n\nTHE TEXT DATA IS:")
    print("--------------------\n\n")
    print(text_data)
    
    
    # Replace nans with blanks
    text_data.fillna("",inplace=True)
    print("\n\n THE SHAPE OF NEW TEXT DATA")
    print("--------------------\n\n")
    print(text_data.shape)
    print("\n\nNow the New TEXT DATA IS:")
    print("-----------------------------\n\n\n")
    print(text_data)
    
    
    # Join all text items in a row that have a space in between
    return text_data.apply(lambda x: " ".join(x), axis=1)

In [20]:
# Import the CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Create the text vector
text_vector = combine_text_columns(X_train)
print("\n\n THE SHAPE OF TEXT VECTOR(Outside Function)")
print("--------------------\n\n")
print(text_vector.shape)
print("\n\nTHE TEXT VECTOR IS(Outside Function) :: ")
print("--------------------------------------\n\n")
print(text_vector)


# Create the token pattern: TOKENS_ALPHANUMERIC
TOKENS_ALPHANUMERIC = '[A-Za-z0-9]+(?=\\s+)'

# Instantiate the CountVectorizer: text_features
text_features =CountVectorizer(token_pattern=TOKENS_ALPHANUMERIC)

# Fit text_features to the text vector
text_features.fit(text_vector)

# Print the first 10 tokens
print(len(text_features.get_feature_names()))
for word in text_features.get_feature_names():
    print(word)



The Initial To_Drop contains
---------------------------------


['FTE', 'Total', 'Function', 'Use', 'Sharing', 'Reporting', 'Student_Type', 'Position_Type', 'Object_Type', 'Pre_K', 'Operating_Status']




The Initial Columns of the DataFrame are:
----------------------------------------


['Object_Description', 'Text_2', 'SubFund_Description', 'Job_Title_Description', 'Text_3', 'Text_4', 'Sub_Object_Description', 'Location_Description', 'FTE', 'Function_Description', 'Facility_or_Department', 'Position_Extra', 'Total', 'Program_Description', 'Fund_Description', 'Text_1']




The Final To_Drop Contains : 
---------------------------------


{'Total', 'FTE'}


 THE SHAPE OF TEXT DATA
--------------------


(320222, 14)


THE TEXT DATA IS:
--------------------


                                       Object_Description  \
134338                                                NaN   
326408                       Personal Services - Teachers   
364634                                  EMPL



 THE SHAPE OF TEXT VECTOR(Outside Function)
--------------------


(320222,)


THE TEXT VECTOR IS(Outside Function) :: 
--------------------------------------


134338       Teacher-Elementary        KINDERGARTEN  KIN...
326408    Personal Services - Teachers   TCHER 2ND GRADE...
364634    EMPLOYEE BENEFITS TEACHER SUBS GENERAL FUND Te...
47683     TEACHER COVERAGE FOR TEACHER TEACHER SUBS GENE...
229958    CONTRA BENEFITS  GENERAL FUND Custodian - PT -...
417668    EDUCATIONAL SPECIAL EDUCATION INSTRUCTION LOCA...
126378    EMPLOYEE BENEFITS  GENERAL FUND Sub Manager, F...
85262     EMPLOYEE BENEFITS TEACHER SUBS GENERAL FUND Te...
304569    EQUIPMENT *  Support Services - Administration...
84272     Personal Services - Teachers   TCHER P E (ELEM...
64760     Regular *  Special Instruction    Certificated...
21870     OTHER PERSONAL SERVICES          SUB TEACHER A...
169454    SUPPLIES                         TEACHER, TITL...
169914    Regular * ADM/PROF Operation and Maintenance o.

interactive
interagy
interdisciplinary
interest
interfund
intergov
intergovernmental
interim
interm
intermediate
intern
internal
international
internet
internl
interntl
interpreter
interrelated
interscholastic
interv
interve
intervent
intervention
interventions
intg
intl
intr
intradistrict
intramural
intrn
intructional
intrvnt
ints
inv
invent
inventory
invest
investigation
investigative
investigator
investment
invntry
involvement
involvment
involvmt
ipd
irc
ironworker
irrig
irrigation
is
isef
isp
iss
issn
issuance
issue
issues
ist
it
ita
item
itema
itemaa
itemab
itemac
itemad
itemae
itemaf
itemag
itemai
itemb
itemc
itemd
iteme
itemfa
itemfb
itemfc
itemfccentral
itemfcco
itemfd
itemfe
itemfh
itemg
itemga
itemgb
itemgc
itemgd
itemge
itemgf
itemgh
itemgi
itemha
itemhb
itemi
itemj
itemk
itemn
itemo
itemp
itempp
itempq
itemr
items
itemt
itemu
itemv
itemwz
itemx
itemy
itemya
itemz
itinerant
itinerate
itl
iv
j
jail
janitor
janitorial
japanese
jcf
jf
jhs
job
jobs
john
journalism
jr
jrotc
judgm

In [9]:
df

Unnamed: 0,Function,Use,Sharing,Reporting,Student_Type,Position_Type,Object_Type,Pre_K,Operating_Status,Object_Description,...,Sub_Object_Description,Location_Description,FTE,Function_Description,Facility_or_Department,Position_Extra,Total,Program_Description,Fund_Description,Text_1
134338,Teacher Compensation,Instruction,School Reported,School,NO_LABEL,Teacher,NO_LABEL,NO_LABEL,PreK-12 Operating,,...,,,1.000000,,,KINDERGARTEN,50471.810000,KINDERGARTEN,General Fund,
206341,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,Non-Operating,CONTRACTOR SERVICES,...,,,,RGN GOB,,UNDESIGNATED,3477.860000,BUILDING IMPROVEMENT SERVICES,,BUILDING IMPROVEMENT SERVICES
326408,Teacher Compensation,Instruction,School Reported,School,Unspecified,Teacher,Base Salary/Compensation,Non PreK,PreK-12 Operating,Personal Services - Teachers,...,,,1.000000,,,TEACHER,62237.130000,Instruction - Regular,General Purpose School,
364634,Substitute Compensation,Instruction,School Reported,School,Unspecified,Substitute,Benefits,NO_LABEL,PreK-12 Operating,EMPLOYEE BENEFITS,...,,,,UNALLOC BUDGETS/SCHOOLS,,PROFESSIONAL-INSTRUCTIONAL,22.300000,GENERAL MIDDLE/JUNIOR HIGH SCH,,REGULAR INSTRUCTION
47683,Substitute Compensation,Instruction,School Reported,School,Unspecified,Teacher,Substitute Compensation,NO_LABEL,PreK-12 Operating,TEACHER COVERAGE FOR TEACHER,...,,,,NON-PROJECT,,PROFESSIONAL-INSTRUCTIONAL,54.166000,GENERAL HIGH SCHOOL EDUCATION,,REGULAR INSTRUCTION
229958,Facilities & Maintenance,O&M,School Reported,School,Unspecified,Custodian,Benefits,NO_LABEL,PreK-12 Operating,CONTRA BENEFITS,...,,,,NON-PROJECT,,UNDESIGNATED,-8.150000,EMPLOYEE BENEFITS,,EMPLOYEE BENEFITS
417668,Instructional Materials & Supplies,Instruction,School Reported,School,Special Education,Non-Position,Supplies/Materials,NO_LABEL,PreK-12 Operating,EDUCATIONAL,...,,,,,,SUPPLIES AND MATERIALS,2000.050000,SPECIAL EDUCATION LOCAL,LOCAL FUND,
126378,Food Services,O&M,School on Central Budgets,Non-School,Unspecified,Coordinator/Manager,Benefits,NO_LABEL,PreK-12 Operating,EMPLOYEE BENEFITS,...,,DISTRICT WIDE ORGANIZATION UNI,,NON-PROJECT,,UNDESIGNATED,0.720000,UNDESIGNATED,,UNDESIGNATED
275539,Teacher Compensation,Instruction,School Reported,School,Unspecified,Teacher,Benefits,NO_LABEL,PreK-12 Operating,EMPLOYEE BENEFITS,...,,,,ELA S - TEACHING SPANISH ONLY,,PROFESSIONAL-INSTRUCTIONAL,228.250000,GENERAL ELEMENTARY EDUCATION,,REGULAR INSTRUCTION
85262,Substitute Compensation,Instruction,School Reported,School,Unspecified,Substitute,Benefits,NO_LABEL,PreK-12 Operating,EMPLOYEE BENEFITS,...,,,,UNALLOC BUDGETS/SCHOOLS,,PROFESSIONAL-INSTRUCTIONAL,69.560000,GENERAL ELEMENTARY EDUCATION,,REGULAR INSTRUCTION


## N-gram range in scikit-learn

Now we'll insert a CountVectorizer instance into our pipeline for the main dataset, and compute multiple n-gram features to be used in the model.

In order to look for ngram relationships at multiple scales, we will use the ngram_range parameter.

Special functions: We'll notice a couple of new steps provided in the pipeline in this and many of the remaining exercises. Specifically, the dim_red step following the vectorizer step , and the scale step preceeding the clf (classification) step.

These have been added in order to account for the fact that we're using a reduced-size sample of the full dataset. To make sure the models perform as the expert competition winner intended, we have to apply a dimensionality reduction technique, which is what the dim_red step does, and we have to scale the features to lie between -1 and 1, which is what the scale step does.

The dim_red step uses a scikit-learn function called SelectKBest(), applying something called the chi-squared test to select the K "best" features. The scale step uses a scikit-learn function called MaxAbsScaler() in order to squash the relevant features into the interval -1 to 1.

We won't need to do anything extra with these functions here, just complete the vectorizing pipeline steps below.

In [21]:
# Import pipeline
from sklearn.pipeline import Pipeline

# Import classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

# Import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Import other preprocessing modules
from sklearn.preprocessing import Imputer
from sklearn.feature_selection import chi2, SelectKBest

# Select 300 best features
chi_k = 300

# Import functional utilities
from sklearn.preprocessing import FunctionTransformer, MaxAbsScaler
from sklearn.pipeline import FeatureUnion

# Perform preprocessing
get_text_data = FunctionTransformer(combine_text_columns, validate=False)
get_numeric_data = FunctionTransformer(lambda x: x[NUMERIC_COLUMNS], validate=False)

# Create the token pattern: TOKENS_ALPHANUMERIC
TOKENS_ALPHANUMERIC = '[A-Za-z0-9]+(?=\\s+)'

# Instantiate pipeline: pl
pl = Pipeline([
        ('union', FeatureUnion(
            transformer_list = [
                ('numeric_features', Pipeline([
                    ('selector', get_numeric_data),
                    ('imputer', Imputer())
                ])),
                ('text_features', Pipeline([
                    ('selector', get_text_data),
                    ('vectorizer',CountVectorizer(token_pattern=TOKENS_ALPHANUMERIC,
                                                   ngram_range=(1,2))),
                    ('dim_red', SelectKBest(chi2, chi_k))
                ]))
             ]
        )),
        ('scale', MaxAbsScaler()),
        ('clf', OneVsRestClassifier(LogisticRegression()))
    ])
# Fit to the training data
pl.fit(X_train, y_train)

# Compute and print accuracy
accuracy = pl.score(X_test, y_test)
print("\nAccuracy on budget dataset: ", accuracy)

## Interaction modeling in scikit-learn

It's time to add interaction features to your model. The PolynomialFeatures object in scikit-learn does just that, but here we're going to a custom interaction object, SparseInteractions. 

Interaction terms are a statistical tool that lets our model express what happens if two features appear together in the same row.
SparseInteractions does the same thing as PolynomialFeatures, but it uses sparse matrices to do so.

PolynomialFeatures and SparseInteractions both take the argument degree, which tells them what polynomia degree of interactions to compute.

We're going to consider interaction terms of degree=2 in our pipeline.. We will insert these steps after the preprocessing steps we've built out so far, but before the classifier steps.

Pipelines with interaction terms take a while to train (since we're making n features into n-squared features!), so as long as we set it up right, we'll do the heavy lifting and tell  what our score is!

In [24]:
from itertools import combinations

import numpy as np
from scipy import sparse
from sklearn.base import BaseEstimator, TransformerMixin


class SparseInteractions(BaseEstimator, TransformerMixin):
    def __init__(self, degree=2, feature_name_separator="_"):
        self.degree = degree
        self.feature_name_separator = feature_name_separator

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        if not sparse.isspmatrix_csc(X):
            X = sparse.csc_matrix(X)

        if hasattr(X, "columns"):
            self.orig_col_names = X.columns
        else:
            self.orig_col_names = np.array([str(i) for i in range(X.shape[1])])

        spi = self._create_sparse_interactions(X)
        return spi

    def get_feature_names(self):
        return self.feature_names

    def _create_sparse_interactions(self, X):
        out_mat = []
        self.feature_names = self.orig_col_names.tolist()

        for sub_degree in range(2, self.degree + 1):
            for col_ixs in combinations(range(X.shape[1]), sub_degree):
                # add name for new column
                name = self.feature_name_separator.join(self.orig_col_names[list(col_ixs)])
                self.feature_names.append(name)

                # get column multiplications value
                out = X[:, col_ixs[0]]
                for j in col_ixs[1:]:
                    out = out.multiply(X[:, j])

                out_mat.append(out)

        return sparse.hstack([X] + out_mat)

In [25]:
# Instantiate pipeline: pl
pl = Pipeline([
        ('union', FeatureUnion(
            transformer_list = [
                ('numeric_features', Pipeline([
                    ('selector', get_numeric_data),
                    ('imputer', Imputer())
                ])),
                ('text_features', Pipeline([
                    ('selector', get_text_data),
                    ('vectorizer', CountVectorizer(token_pattern=TOKENS_ALPHANUMERIC,
                                                   ngram_range=(1, 2))),  
                    ('dim_red', SelectKBest(chi2, chi_k))
                ]))
             ]
        )),
        ('int',SparseInteractions(degree=2)),
        ('scale', MaxAbsScaler()),
        ('clf', OneVsRestClassifier(LogisticRegression()))
    ])
# Fit to the training data
pl.fit(X_train, y_train)

# Compute and print accuracy
accuracy = pl.score(X_test, y_test)
print("\nAccuracy on budget dataset: ", accuracy)

## Hashing

Hash function takes an input, in our case a token, and outputs a hash value. For example, the input may be a string and the hash value may be an integer.

Some problems are memory-bound and not easily parallelizable, and hashing enforces a fixed length computation instead of using a mutable datatype (like a dictionary).

By explicitly stating how many possible outputs the hashing function may have, we limit the size of the objects that need to be processed. With these limits known, computation can be made more efficient and we can get results faster, even on large datasets.


HashingVectorizer acts just like CountVectorizer in that it can accept token_pattern and ngram_range parameters. The important difference is that it creates hash values from the text, so that we get all the computational advantages of hashing!

In [26]:
# Import HashingVectorizer
from sklearn.feature_extraction.text import HashingVectorizer

# Get text data: text_data
text_data = combine_text_columns(X_train)

# Create the token pattern: TOKENS_ALPHANUMERIC
TOKENS_ALPHANUMERIC = '[A-Za-z0-9]+(?=\\s+)' 

# Instantiate the HashingVectorizer: hashing_vec
hashing_vec =HashingVectorizer(token_pattern=TOKENS_ALPHANUMERIC)

# Fit and transform the Hashing Vectorizer
hashed_text =hashing_vec.fit_transform(text_data)

# Create DataFrame and print the head
hashed_df = pd.DataFrame(hashed_text.data)
print(hashed_df.head())



The Initial To_Drop contains
---------------------------------


['FTE', 'Total', 'Function', 'Use', 'Sharing', 'Reporting', 'Student_Type', 'Position_Type', 'Object_Type', 'Pre_K', 'Operating_Status']




The Initial Columns of the DataFrame are:
----------------------------------------


['Object_Description', 'Text_2', 'SubFund_Description', 'Job_Title_Description', 'Text_3', 'Text_4', 'Sub_Object_Description', 'Location_Description', 'FTE', 'Function_Description', 'Facility_or_Department', 'Position_Extra', 'Total', 'Program_Description', 'Fund_Description', 'Text_1']




The Final To_Drop Contains : 
---------------------------------


{'Total', 'FTE'}


 THE SHAPE OF TEXT DATA
--------------------


(320222, 14)


THE TEXT DATA IS:
--------------------


                                       Object_Description  \
134338                                                NaN   
326408                       Personal Services - Teachers   
364634                                  EMPL



 THE SHAPE OF NEW TEXT DATA
--------------------


(320222, 14)


Now the New TEXT DATA IS:
-----------------------------



                                       Object_Description  \
134338                                                      
326408                       Personal Services - Teachers   
364634                                  EMPLOYEE BENEFITS   
47683                        TEACHER COVERAGE FOR TEACHER   
229958                                    CONTRA BENEFITS   
417668                                        EDUCATIONAL   
126378                                  EMPLOYEE BENEFITS   
85262                                   EMPLOYEE BENEFITS   
304569                                        EQUIPMENT *   
84272                        Personal Services - Teachers   
64760                                           Regular *   
21870                      OTHER PERSONAL SERVICES          
169454                     SUPPLIES                         
169914             

          0
0  0.377964
1  0.755929
2  0.377964
3  0.377964
4  0.235702


Some text is hashed to the same value, but this doesn't neccessarily hurt performance.

In [None]:
# Import the hashing vectorizer
from sklearn.feature_extraction.text import HashingVectorizer
# Instantiate the winning model pipeline: pl
pl = Pipeline([
        ('union', FeatureUnion(
            transformer_list = [
                ('numeric_features', Pipeline([
                    ('selector', get_numeric_data),
                    ('imputer', Imputer())
                ])),
                ('text_features', Pipeline([
                    ('selector', get_text_data),
                    ('vectorizer', HashingVectorizer(token_pattern=TOKENS_ALPHANUMERIC,
                                                     non_negative=True, norm=None, binary=False,
                                                     ngram_range=(1,2))),
                    ('dim_red', SelectKBest(chi2, chi_k))
                ]))
             ]
        )),
        ('int', SparseInteractions(degree=2)),
        ('scale', MaxAbsScaler()),
        ('clf', OneVsRestClassifier(LogisticRegression()))
    ])




The Initial To_Drop contains
---------------------------------


['FTE', 'Total', 'Function', 'Use', 'Sharing', 'Reporting', 'Student_Type', 'Position_Type', 'Object_Type', 'Pre_K', 'Operating_Status']




The Initial Columns of the DataFrame are:
----------------------------------------


['Object_Description', 'Text_2', 'SubFund_Description', 'Job_Title_Description', 'Text_3', 'Text_4', 'Sub_Object_Description', 'Location_Description', 'FTE', 'Function_Description', 'Facility_or_Department', 'Position_Extra', 'Total', 'Program_Description', 'Fund_Description', 'Text_1']




The Final To_Drop Contains : 
---------------------------------


{'Total', 'FTE'}


 THE SHAPE OF TEXT DATA
--------------------


(320222, 14)


THE TEXT DATA IS:
--------------------


                                       Object_Description  \
134338                                                NaN   
326408                       Personal Services - Teachers   
364634                                  EMPL



 THE SHAPE OF NEW TEXT DATA
--------------------


(320222, 14)


Now the New TEXT DATA IS:
-----------------------------



                                       Object_Description  \
134338                                                      
326408                       Personal Services - Teachers   
364634                                  EMPLOYEE BENEFITS   
47683                        TEACHER COVERAGE FOR TEACHER   
229958                                    CONTRA BENEFITS   
417668                                        EDUCATIONAL   
126378                                  EMPLOYEE BENEFITS   
85262                                   EMPLOYEE BENEFITS   
304569                                        EQUIPMENT *   
84272                        Personal Services - Teachers   
64760                                           Regular *   
21870                      OTHER PERSONAL SERVICES          
169454                     SUPPLIES                         
169914             

In [None]:
# Fit to the training data
pl.fit(X_train, y_train)

# Compute and print accuracy
accuracy = pl.score(X_test, y_test)
print("\nAccuracy on budget dataset: ", accuracy)