In [11]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import nltk

# Load the dataset
file_path = './Combined_News_DJIA.csv'  # Replace with your file path
df = pd.read_csv(file_path)

# NLTK download
nltk.download('punkt')
nltk.download('stopwords')

# Text Preprocessing Function
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    text = text.lstrip('b')

    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text, re.I|re.A)
    # Tokenize text
    tokens = word_tokenize(text)
    # Remove stopwords
    filtered_tokens = [token for token in tokens if token not in set(stopwords.words('english'))]
    # Stemming
    ps = PorterStemmer()
    stemmed_tokens = [ps.stem(token) for token in filtered_tokens]
    return ' '.join(stemmed_tokens)

# Combine all news columns into one
df['Combined_News'] = df[df.columns[2:]].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)

# Apply preprocessing
df['Processed_News'] = df['Combined_News'].apply(preprocess_text)

[nltk_data] Downloading package punkt to /Users/lionvsx/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lionvsx/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                      Processed_News
0  georgia down two russian warplan countri move ...
1  wont america nato help us wont help us help ir...
2  rememb ador yearold sang open ceremoni fake br...
3  us refus israel weapon attack iran report bwhe...
4  expert admit legalis drug bwar south osetia pi...


In [14]:
df[['Processed_News']].head(100)

Unnamed: 0,Processed_News
0,georgia down two russian warplan countri move ...
1,wont america nato help us wont help us help ir...
2,rememb ador yearold sang open ceremoni fake br...
3,us refus israel weapon attack iran report bwhe...
4,expert admit legalis drug bwar south osetia pi...
...,...
95,american express get billion capit one get bil...
96,hell earth citi shipbreak bfull text iranian p...
97,reportedli kill isra strike gaza bjust remind ...
98,today israel take entir apt build civilian kil...


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# Define the pipeline
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),  # Handle missing values in text
    ('tfidf', TfidfVectorizer()),  # Convert text to TF-IDF features
    ('classifier', RandomForestClassifier())  # ML model
])

# Assuming 'Processed_News' is your preprocessed text data
X = df['Processed_News']  
y = df['Label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the pipeline to the training data
pipeline.fit(X_train, y_train)

ValueError: Expected 2D array, got 1D array instead:
array=['libyan forc captur gaddafi local tv report whistleblow murder india illustr fatal hazard expos corrupt report undercov polic delay alleg polic chief authoris undercov polic offic give fals evid court russia press un resolut aim keep thousand shoulderfir surfacetoair missil purchas former libyan leader moammar gadhafi get hand arm group terrorist time un formal apolog legitim gaddafi regim elect libya human right council last year secur council gener assembl presid muammar gaddafi kill celebr chines linguist dissid zhou youguang ordinari peopl longer believ communist parti peopl ask there hope china im optimist didnt even lose hope japanes occup world war ii jew religion yoram kaniuk novelist win court victori israel jewish nation atheist religion sound like progress rais new question church state relat mean jewish syria protest mutat arm resist amid econom meltdown gm crop fail increas yield vastli increas use chemic growth superwe studi accus monsanto control indian cotton seed market massiv push price libya end qaddafi fall sirt pic author knew rupert murdoch news world tabloid hack phone murder teenag milli dowler nine year scandal practic explod gaddafi remov truck appar aliv video bolivian nativ triumphantli enter la paz month march amazon oppos construct highway homeland road built pristin isiboro secur nation park amp indigen territori inhabit nativ peopl polic spi unit cross line say lord macdonald former director public prosecut accus metropolitan polic monument misjudg allow undercov offic give fals evid injuri leg torso head former dictat found alon drain pipe surround bodi guard shot fled sirt west mainstream press account australia make case piraci problem venezuelan suprem court today basic belong presid chvez ever sinc support pack court rule repeatedli sought protect presid polit agenda uphold basic human right benjamin netanyahu ridicul israel attempt capit releas gilad shalit rate agenc forecast throw entir countri financi crise drive interest rate bond eu consid ban rate countri process negoti econom bailout packag mexican polic head shelter charg prostitut women resid cnncom basqu group eta end arm independ campaign rise fall muammar gaddafi pic qaddafi death leav libya must build scratch muammar qaddafi govern one kind independ institut mean libya new govern noth good bad build canada suprem court rule hyperlink websit link defamatori content constitut public shield content sharer liabil inadvert post link libel materi'
 'netherland decrimin insult head state iranian women call western tourist violat headscarf law fight oppress swiss armi readi act worst case migrant scenario switzerland said readi mobil soldier border cross migrant continu arriv countri russian engin use dog robot success clear ancient syrian citi palmyra nearli explos devic left behind islam state yearold romanera mosaic found southern turkey say cheer enjoy life edward snowden sue norway bid secur free passag norway without extradit us want come norway accept freedom speech award norwegian fighter jet help save die patient hospit staff call air forc f transport vital equip trondheim hospit patient mile away us armi captain foil denmark school attack us armi captain iraq help crack plot bomb danish school glean crucial detail document recov extremist pentagon said wednesday olymp bike lane collaps rio least die bori johnson suggest partkenyan obama may ancestr dislik britain barcelona polic raid arrest chines mafia free traffick chines women us charg woman export underwat drone technolog china kindli send us name quot insult presid ankara consul ask dutch citizen decrimin drug busi world leader tell un us soldier help foil plot blow school denmark brussel bomber identifi jailer foreign isi hostag najim laachraoui islam state guard known abou idriss accord lawyer former syria hostag nicola henin hong kong bar may offer men night court deem ladi night promo discriminatori mexico consid cannabi legal turkish hotlin erdogan insult anger dutch quak strike ecuador amid recoveri effort germani arrest two teenag suspect bomb sikh templ de teen salafist arrest bomb attack sikh templ uk use massiv dataset spi innoc civilian year blow pope reform audit vatican financ forc halt russian forc fire isra militari aircraft syria'
 'shouldnt outrag lie wikileak almost silenc publish video bmsnbc block search wikileak bwikileak founder msnbc tape includ kill civilian afghanistan releas soon bhi reddit dont know get upvot leav pictur earthquak hometown citi mexicali baja california bthe famili murder reuter photograph video burst tear last night saw leak video first time ban independ expert assess wikileak footag veteran anthoni martinez point support action crazyhors come van arriv somewher around engag bi believ youv kill church holi father moral bankrupt pope could call news role child abus coverup petti gossip bmagnitud quak shake indonesia bgreenwald iraq slaughter aberr bon thursday world bank vote whether grant south african state own util bn loan larg chunk use financ contract compani front rule anc parti pleas help expos corrupt world bank make right decis b kill maoist ambush indian crpf team time india bdemocraci depth coverag wikileak video bnorth korean red star oper system detail emerg bwayn leari hero father save year old daughter push away speed car die instead bnigerian polic fail ironi kill protestor rncop brutal bthi realli confirm weve said along transpar investig incid hasnt taken place need iraqi respond collater murder bit took coupl day report find someon el paso escort jurez mexico drugrel crime rage recent final saw bmap made reuter event lead collater murder video bat least indian soldier kill maoist ambush bthe sex abus scandal face cathol look lot like waterg cathol church similar pathway resign pope bgordon brown call may gener elect bcanadian research uncov vast shadow network onlin espionag base china use seemingli harmless mean email twitter extract highli sensit data comput around world b weird world waziristan nrambl alway interest account social chang afghanistan especi modernis king amanullah histor embed video includ mullah omar prophet cloakn bu fight taliban heavi metal music bmother children wound apach attack kill father sell home video show children'
 ...
 'iran gear greatest wargam ever strait hormuz damascu bomber kill wound copyright religion kopim offici recogn sweden europ biggest free wifi zone set london iran mount new web crackdown new rule impos layer surveil countri popular internet cafe tehran polit establish come increas strain econom turmoil threat intern sanction zimbabw femal rapist accus semen harvest strike gang femal rapist prowl sexual assault collect sperm male hitchhik pirat parti board german polit peopl tire mainstream syrian defector say govern lost control human monster turkey arrest former chiefofstaff journalist plot overthrow govern haiti cholera death toll near nov one largest cholera outbreak modern histori affect singl countri ethiopia troop quit somalia report syria troop fire arab leagu monitor dead south sudan pibor counti ethnic massacr israel polic struggl suppress jewish extremist west bank say senior offic iran call threat sanction european union econom war beij cultur war isnt usit china futur xpost rchina doctor resid blame us weapon catastroph level birth defect fallujah newborn third rate agenc downgrad hungari junk news intern phone hack rebekah brookss former pa arrest iranian brace war western nation buy critic suppli store mad north korea entir peopl go mad sometim certainli seem turkey arrest former armi chief plot bring govern rumor n korean nuclear explos prompt brief stock panic south us court allow ecuador collect b judgement chevron toxic wast dump qaddafi come lead amnesti intern pull onlin poll human right hero'
 'judg question boy charg underag sex case iss crew russia usa germani hug take selfi say polit ukrain tension space nobodi want host olymp investig alleg mass surveil german citizen investig nsa alleg spi chancellor angela merkel phone german feder prosecutor decid bank england governor capit doom ethic vanish china us breach intern law infring human right put global cybersecur risk demand halt unscrupul us cyberspi bloodi war water mexico resid town outsid mexico citi injur polic fight defend natur spring tap fugit us intellig leaker edward snowden describ train spi specialis electron surveil dismiss claim mere lowlevel analyst interview nbc reiter work undercov oversea cia nsa per cent drug seizur europ cannabi facebook want listen your glenn greenwald nsa document middl east disclos gaza hama govern say readi step asid hand full respons palestinian uniti govern zambian park ranger charg law enforc shot dead shot death suspect poacher investig report gunshot rebel kill new ukrain leader unleash assault ukrainian aircraft paratroop kill prorussian rebel assault rage second day tuesday newli elect presid vow crush revolt east leg pant qatar tell tourist russia offer mln free financi aid syria googl start build selfdriv car car stopgo button control steer wheel pedal humpback whale strand w australia malnutrit major factor unpreced number mostli young whale becom strand coast brazil antiworld cup protest clash polic polic brazil fire tear ga antiworld cup indigen demonstr capit brasilia bangkok post report facebook block thailand median ceo pay cross million us tell citizen leav libya immedi russia see busi interest iss doubt mission continu past racism rise britain british social attitud survey find proport peopl uk say racial prejud risen sinc iran order zionist zuckerburg appear court'
 'franc deport radic muslim cleric julian assang run seat australian senat senior british rabbi film tell alleg child abus victim go polic korea space rocket launch appar success becom th countri reach space soil yonhap news franc deport radic foreign imam franc interior minist say move part fight global jihad timbuktu ahm baba institut manuscript save accord local beij cancel flight pollut hit beyond index level african billionair give away half money payback time mali resid hunt beat suspect islamist extremist newli liber town dutch court reject shell spill case judg reject landmark case brought nigerian farmer royal dutch shell poison fish pond amp farmland leak pipelin isra forc attack target syrianlebanes border report heavi presenc isra jet lebanon territori secretarygener unit nation say situat syria catastroph appeal side particularli syrian govern stop kill name human stop kill stop violenc germani mark th anniversari adolf hitler rise power solemn ceremoni memori site heart nazi berlin shamil jeppi timbuktu manuscript project director univers cape town said manuscript said region safe man face anim cruelti charg dead cat found apart windsor ont zimbabw bank russia pull us drug agreement decadeold drug control deal address today realiti amp exhaust potenti seen latest sign deterior tie sinc presid putin return kremlin innov colleg homeless peopl london attract hundr student british internet user person inform major cloud storag servic spi upon routin us author egypt state brink collaps chines love himalayan viagra caus shortag overharvest amp threaten ecolog damag world expens medicin fungu seriou declin symbol wealth amp power china amp economi boom fungu enjoy unpreced popular gay chocolati forc remot scottish villag driven suffer decad homophob antienglish abus mexico raid alleg sex slaveri cult border detain foreign includ spaniard famili find pet tortois miss year clean dead father storeroom china bewar camera may watch estim million surveil camera china everi peopl offici say camera help fight crime amp maintain social stabil critic say theyr use monitor amp intimid dissid'].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.